4 * Copyright (C) 2006-2009
5 * Neil Brown <neilb@suse.de>
6 * Released under the GPL, version 2
8 * Orphan file management
10 * Inode 8 in the primary fileset is the orphan file.
11 * It records blocks in files that might need special care
12 * after a restart. The particular 'special care' depends on
13 * the block and the type of file. Some examples are:
15 * The inode may have a linkcount of 0 and if so, it should
16 * be truncated and removed
17 * The inode might have blocks beyond EOF in which case they
18 * should be truncated.
20 * An internal chain from this block might be being released...
21 * see dir.c for further details.
23 * Each entry in the orphan file is 16 bytes holding 4 u32s
25 * These are fsnumber, inodenumber, blocknumber, data
26 * The data can be set by whoever creates the orphan.
28 * The orphan file is dense - no holes. When an entry
29 * is freed the last entry is moved down.
31 * When a block is in the orphan file, it has its 'orphan_slot'
32 * set to record where the orphan entry is.
34 * When updating a block of the orphan file we must first make sure
35 * that it isn't dirty in the previous phase. If it is, we must wait
36 * for it to be written out. It is permitted to wait for this
37 * while holding the checkpoint lock.
41 * Orphan entries are always created as part of a commit transaction
42 * so we need the prepare/pin/commit/abort functions
/* Debug aid: walk every used slot of the orphan file and printk the
 * record it holds (type, filesys, inum, addr).
 * NOTE(review): this excerpt is sampled -- intermediate source lines
 * (braces, local declarations, early returns) are not visible here.
 */
48 void lafs_dump_orphans(void)
53 if (!dfs || !dfs->orphans) {
54 printk("No orphan FS !!\n");
/* i_mutex serialises access to om->nextfree/om->reserved and the slots */
58 mutex_lock_nested(&dfs->orphans->i_mutex, I_MUTEX_QUOTA);
60 om = &LAFSI(dfs->orphans)->md.orphan;
61 printk("nextfree = %u\n", (unsigned)om->nextfree);
62 printk("reserved = %u\n", (unsigned)om->reserved);
/* entries are 16 bytes, so blocksize_bits-4 is log2(entries per block) */
64 for (slot = 0; slot < om->nextfree; slot++) {
70 bnum = slot >> (dfs->blocksize_bits-4);
71 ob = lafs_get_block(dfs->orphans, bnum, NULL, GFP_KERNEL,
74 ent = slot - (bnum << (dfs->blocksize_bits-4));
75 printk("%3d: %d %u %u %u\n", slot,
76 or[ent].type, or[ent].filesys, or[ent].inum,
79 putdref(ob, MKREF(dumpo));
82 mutex_unlock(&dfs->orphans->i_mutex);
87 * Guide to MKREF reference names:
88 * "orphan_reserve". This is a block in the orphan file, and someone
89 * owns a reservation which would be committed by updating this block.
90 * There are "om->reserved" of these, typically all on the one block.
91 * "orphan" This is a block in the orphan file which holds an orphan
92 * record for some committed orphan. ->orphan_slot points here.
93 * "orphan_flag" This is a block (not in orphan file) which is an orphan, and
94 * has the B_Orphan flag set.
95 * "orphanx", "orphan_release", "orphan_release2", "orphan_move"
96 * "orphan_fs", "orphan_ino", "orphan_blk" Temporary references
99 /* Before locking the checkpoint, we need to reserve space to
100 * store the orphan record, and make sure we hold a reference
/* Reserve room for one new orphan record before the checkpoint lock is
 * taken: locate the block that will hold slot (nextfree + reserved),
 * read it in (async if @async), and mark it PinPending.
 * NOTE(review): sampled excerpt -- the error-handling lines between the
 * visible statements are missing from this view.
 */
103 static int orphan_prepare(struct fs *fs, int async)
105 struct orphan_md *om = &LAFSI(fs->orphans)->md.orphan;
113 mutex_lock_nested(&fs->orphans->i_mutex, I_MUTEX_QUOTA);
/* slot index -> block number: 16-byte entries, shift by bits-4 */
114 bnum = (om->nextfree + om->reserved) >>
115 (fs->blocksize_bits-4);
116 b = lafs_get_block(fs->orphans, bnum, NULL, GFP_KERNEL,
117 MKREF(orphan_reserve));
119 mutex_unlock(&fs->orphans->i_mutex);
122 err = lafs_read_block_async(b);
124 err = lafs_read_block(b);
126 putdref(b, MKREF(orphan_reserve));
130 mutex_lock_nested(&fs->orphans->i_mutex, I_MUTEX_QUOTA);
132 mutex_unlock(&fs->orphans->i_mutex);
/* wait for any write from the previous phase to complete (see file
 * header comment) before pinning the block for the coming update */
134 lafs_iolock_written(&b->b);
135 set_bit(B_PinPending, &b->b.flags);
136 lafs_iounlock_block(&b->b);
/* Pin the orphan-file block that will receive the new record so it can
 * be dirtied in the current phase.  Callers treat the return value with
 * IS_ERR() (see lafs_make_orphan), so presumably this returns the pinned
 * block or an ERR_PTR -- TODO confirm, excerpt is sampled.
 */
140 static struct datablock *orphan_pin(struct fs *fs, struct datablock *b)
142 struct orphan_md *om = &LAFSI(fs->orphans)->md.orphan;
146 struct datablock *ob;
149 bnum = slot >> (fs->blocksize_bits-4);
150 dprintk("slot=%d bnum=%d\n", slot, bnum);
151 ob = lafs_get_block(fs->orphans, bnum, NULL, GFP_KERNEL, MKREF(orphan));
152 /* that *must* succeed as we own a reference on the block already */
153 LAFS_BUG(ob == NULL, &b->b);
155 err = lafs_pin_dblock(ob, ReleaseSpace);
157 putdref(ob, MKREF(orphan));
163 static void orphan_commit(struct fs *fs, struct datablock *b, struct datablock *ob)
165 struct orphan_md *om = &LAFSI(fs->orphans)->md.orphan;
169 /* Committed to being an orphan now */
170 b->orphan_slot = om->nextfree++;
171 set_bit(B_Orphan, &b->b.flags);
172 getdref(b, MKREF(orphan_flag));
173 lafs_igrab_fs(b->b.inode);
174 lafs_add_orphan(fs, b);
176 dprintk("%p->orphan_slot=%d (%lu,%lu,%lu) %s\n", b, b->orphan_slot,
177 LAFSI(b->b.inode)->filesys->i_ino,
178 b->b.inode->i_ino, b->b.fileaddr, strblk(&b->b));
181 putdref(ob, MKREF(orphan_reserve));
184 ent = b->orphan_slot - (ob->b.fileaddr
185 << (fs->blocksize_bits-4));
186 or[ent].type = cpu_to_le32(1);
187 or[ent].filesys = cpu_to_le32(LAFSI(b->b.inode)->filesys->i_ino);
188 or[ent].inum = cpu_to_le32(b->b.inode->i_ino);
189 or[ent].addr = cpu_to_le32(b->b.fileaddr);
190 unmap_dblock(ob, or);
191 lafs_dirty_dblock(ob);
194 /* We failed to allocate space to write the update to the orphan
195 * block so we need to revert the reservation done in orphan_pin.
196 * Note that we don't 'unmark' the block from being an orphan.
197 * We let regular orphan processing find that out and release it.
/* Revert the reservation made by orphan_prepare/orphan_pin when we could
 * not complete the update.  If the reverted reservation was the only use
 * of the last block in the file, erase (hole-punch) that block.
 * NOTE(review): sampled excerpt -- some lines are missing from view.
 */
200 static void orphan_abort(struct fs *fs)
204 struct orphan_md *om = &LAFSI(fs->orphans)->md.orphan;
207 bnum = (om->nextfree + om->reserved) >>
208 (fs->blocksize_bits-4);
209 b = lafs_get_block(fs->orphans, bnum, NULL, GFP_KERNEL,
211 /* Note that we now own two references to this block, one
212 * we just got and one we are trying to get rid of
214 putdref(b, MKREF(orphanx));
216 /* If this was the last block in the file,
217 * we need to punch a hole.
218 * i.e. if the first unneeded slot is at or before
219 * the first slot in this block, we don't want
222 if ((om->nextfree + om->reserved) <=
223 (bnum << (fs->blocksize_bits-4))
/* erase_dblock should not block: the block must still be PinPending */
225 LAFS_BUG(!test_bit(B_PinPending, &b->b.flags), &b->b);
226 lafs_erase_dblock(b);
229 putdref(b, MKREF(orphan_reserve));
232 /* When we are about to make a change which might make a block
233 * into an 'orphan', we call lafs_make_orphan to record the fact.
234 * This sets B_Orphan, puts the block on the fs->pending_orphans list
235 * and stores the fs/inode/block number in the orphan file.
236 * The handle_orphans routine for the relevant file type must ensure
237 * that orphan handling doesn't happen until the block is genuinely
239 * For directories, i_mutex is used for this.
240 * For inodes, B_IOLock is used.
/* Record @db as an orphan: prepare (reserve space, blocking mode),
 * then pin and commit under the checkpoint lock and orphans->i_mutex.
 * Already-orphaned blocks are detected up front via B_Orphan.
 * On -EAGAIN from orphan_pin we drop the locks and wait for the
 * checkpoint to complete.  NOTE(review): sampled excerpt -- the retry /
 * return lines between the visible statements are missing from view.
 */
242 int lafs_make_orphan(struct fs *fs, struct datablock *db)
245 struct datablock *ob;
247 if (test_bit(B_Orphan, &db->b.flags))
250 err = orphan_prepare(fs, 0);
254 lafs_checkpoint_lock(fs);
255 mutex_lock_nested(&fs->orphans->i_mutex, I_MUTEX_QUOTA);
256 ob = orphan_pin(fs, db);
257 if (IS_ERR(ob) && PTR_ERR(ob) == -EAGAIN) {
258 mutex_unlock(&fs->orphans->i_mutex);
259 lafs_checkpoint_unlock_wait(fs);
266 orphan_commit(fs, db, ob);
268 mutex_unlock(&fs->orphans->i_mutex);
269 lafs_checkpoint_unlock(fs);
273 /* non-blocking version of make_orphan: async prepare, trylock on
 * orphans->i_mutex instead of sleeping.
 * NOTE(review): sampled excerpt -- error-path lines are missing.
 */
274 int lafs_make_orphan_nb(struct fs *fs, struct datablock *db)
277 struct datablock *ob;
279 if (test_bit(B_Orphan, &db->b.flags))
/* async==1: orphan_prepare must not block on the read */
282 err = orphan_prepare(fs, 1);
286 lafs_checkpoint_lock(fs);
287 if (!mutex_trylock(&fs->orphans->i_mutex)) {
288 lafs_checkpoint_unlock(fs);
291 ob = orphan_pin(fs, db);
296 orphan_commit(fs, db, ob);
298 mutex_unlock(&fs->orphans->i_mutex);
299 lafs_checkpoint_unlock(fs);
303 static void lafs_drop_orphan(struct fs *fs, struct datablock *db);
305 * When any processing of an orphan makes it not an orphan any more
306 * (e.g. link is created for a file, directory block is cleaned)
307 * we need to delete the entry from the orphan file.
308 * If this isn't the last entry in the file we swap the last entry
309 * in thus keeping the file dense.
310 * We don't have the blocks in the orphan file reserved. If we
311 * cannot get a reservation we fail to release the orphan status
312 * so we can try again later.
313 * All IO requests here must be async as we run from the cleaner
315 * We can block in orphan->i_mutex or even in erase_dblock in
316 * orphan_abort, but i_mutex is only held for very short periods
317 * like a spinlock, and erase_dblock should not block as the block
318 * should be PinPending;
/* Remove @b's entry from the orphan file now that it is no longer an
 * orphan.  If the entry is not the last one, the last entry is swapped
 * into its slot to keep the file dense, and that entry's block has its
 * orphan_slot updated to match.  Reference ownership is then rebalanced
 * and the synthesised reservation dropped via orphan_abort (per the
 * comment at L243).
 * NOTE(review): sampled excerpt -- goto labels, returns and some
 * statements between the visible lines are missing from view.
 */
320 void lafs_orphan_release(struct fs *fs, struct datablock *b)
322 struct datablock *ob1, *ob2;
323 int shift = b->b.inode->i_blkbits - 4;
324 struct orphan_md *om = &LAFSI(fs->orphans)->md.orphan;
325 struct orphan *or, *lastor, last;
328 if (!test_bit(B_Orphan, &b->b.flags))
331 ob1 = lafs_get_block(fs->orphans, b->orphan_slot >> shift, NULL,
332 GFP_KERNEL, MKREF(orphan_release));
333 /* We already own an 'orphan' reference on this block, so we
334 * don't need this one
336 LAFS_BUG(ob1 == NULL, &b->b);
337 putdref(ob1, MKREF(orphan_release));
339 BUG_ON(!test_bit(B_PinPending, &ob1->b.flags));
341 lafs_checkpoint_lock(fs);
343 if (lafs_pin_dblock(ob1, ReleaseSpace) < 0)
346 /* Note that orphan_slot can change spontaneously unless
347 * orphans->i_mutex is held
349 mutex_lock_nested(&fs->orphans->i_mutex, I_MUTEX_QUOTA);
350 ent = b->orphan_slot & ((1<<shift)-1);
352 dprintk("os=%d sh=%d ent=%d nf=%d\n", b->orphan_slot, shift,
355 if (b->orphan_slot != om->nextfree-1) {
356 /* need to swap in the last entry */
358 struct datablock *bbl;
360 /* All blocks in the orphan file are referenced either by
361 * an orphan or by a reservation, so this must succeed.
363 ob2 = lafs_get_block(fs->orphans, (om->nextfree-1) >> shift,
364 NULL, GFP_KERNEL, MKREF(orphan_move));
365 LAFS_BUG(ob2 == NULL, &b->b);
367 LAFS_BUG(!test_bit(B_Valid, &ob2->b.flags), &ob2->b);
368 LAFS_BUG(!test_bit(B_PinPending, &ob2->b.flags), &ob2->b);
370 if (lafs_pin_dblock(ob2, ReleaseSpace) < 0) {
371 putdref(ob2, MKREF(orphan_move));
/* capture and clear the last entry */
375 lastor = map_dblock_2(ob2);
376 lastent = (om->nextfree-1) & ((1<<shift)-1);
377 dprintk("lastor=%p lastent=%d\n", lastor, lastent);
378 last = lastor[lastent];
379 lastor[lastent].type = 0;
380 unmap_dblock_2(ob2, lastor);
382 if (last.type == 0) {
383 /* this entry was invalidated during read-in.
384 * let's just forget it
389 putdref(ob2, MKREF(orphan_move));
393 /* The block, being an orphan, must exist */
394 bino = lafs_iget_fs(fs,
395 le32_to_cpu(last.filesys),
396 le32_to_cpu(last.inum), ASYNC);
397 LAFS_BUG(!bino, &b->b);
398 bbl = lafs_get_block(bino,
399 le32_to_cpu(last.addr),
400 NULL, GFP_KERNEL, MKREF(orphan_blk));
401 LAFS_BUG(!bbl, &b->b);
403 dprintk("Q %d %d\n", bbl->orphan_slot, om->nextfree);
404 LAFS_BUG(bbl->orphan_slot != om->nextfree-1, &bbl->b);
/* move the last record into the freed slot and repoint its block */
406 or = map_dblock(ob1);
408 unmap_dblock(ob1, or);
409 bbl->orphan_slot = b->orphan_slot;
410 putdref(bbl, MKREF(orphan_blk));
413 lafs_dirty_dblock(ob1);
414 lafs_dirty_dblock(ob2);
416 /* The orphan reference on ob2 has moved to ob1
417 * and ob2 is now a 'reservation' reference.
418 * So we drop ob2 an extra time, but not ob1 at all.
420 getdref(ob2, MKREF(orphan_reserve));
421 putdref(ob2, MKREF(orphan_move));
422 putdref(ob2, MKREF(orphan));
424 or = map_dblock(ob1);
426 unmap_dblock(ob1, or);
427 lafs_dirty_dblock(ob1);
/* the 'orphan' ref on ob1 becomes a 'reservation' ref */
428 getdref(ob1, MKREF(orphan_reserve));
429 putdref(ob1, MKREF(orphan));
/* @b is no longer an orphan: release the refs B_Orphan owned */
433 clear_bit(B_Orphan, &b->b.flags);
434 lafs_iput_fs(b->b.inode);
435 lafs_drop_orphan(fs, b);
436 putdref(b, MKREF(orphan_flag));
438 /* Now drop the reservation we just synthesised */
441 mutex_unlock(&fs->orphans->i_mutex);
443 lafs_checkpoint_unlock(fs);
/* Return the inode whose i_mutex must be held while handling orphan
 * @db, grabbed as appropriate for the file type, or NULL when none is
 * needed.  Paired with orphan_iput().
 * NOTE(review): sampled excerpt -- switch cases and the interleaved
 * logic are mostly missing from view.
 */
446 static struct inode *orphan_inode(struct datablock *db)
448 struct inode *ino, *rv = NULL;
449 switch (LAFSI(db->b.inode)->type) {
451 ino = rcu_my_inode(db);
453 test_bit(I_Deleting, &LAFSI(ino)->iflags)) {
454 /* don't need rcu while I_Deleting is set */
463 return igrab(db->b.inode);
/* Release an inode obtained from orphan_inode(); the I_Deleting test
 * mirrors the grab-side logic.  NOTE(review): body mostly not visible
 * in this sampled excerpt.
 */
469 static void orphan_iput(struct inode *ino)
473 if (test_bit(I_Deleting, &LAFSI(ino)->iflags))
/* Process the pending_orphans list: move each block to a local 'done'
 * list, dispatch to the type-specific handler under the owning inode's
 * i_mutex, then splice the remainder back.  Returns a timeout in
 * jiffies for the calling thread (MAX_SCHEDULE_TIMEOUT when idle;
 * 500ms when a trylock failed or a handler wants a retry).
 * NOTE(review): sampled excerpt -- loop braces, 'ino' declaration and
 * several statements are missing from view.
 */
478 long lafs_run_orphans(struct fs *fs)
480 struct datablock *db;
481 struct list_head done;
482 long timeout = MAX_SCHEDULE_TIMEOUT;
484 if (list_empty_careful(&fs->pending_orphans))
485 return MAX_SCHEDULE_TIMEOUT;
487 INIT_LIST_HEAD(&done);
488 /* Now process the orphans. Move each one to 'done',
489 * then handle it. Those which remain on 'done' get
490 * moved back at the end.
493 set_bit(OrphansRunning, &fs->fsstate);
494 spin_lock(&fs->lock);
495 while (!list_empty(&fs->pending_orphans)) {
497 db = list_entry(fs->pending_orphans.next,
500 list_move(&db->orphans, &done);
501 getdref(db, MKREF(run_orphans));
502 spin_unlock(&fs->lock);
503 ino = orphan_inode(db);
504 if (!ino && !test_bit(B_Claimed, &db->b.flags))
505 /* OK not to have an inode */;
506 else if (!ino || !mutex_trylock(&ino->i_mutex)) {
507 /* we cannot get a wakeup when mutex
508 * is unlocked, so we need timeout.
511 timeout = msecs_to_jiffies(500);
512 putdref(db, MKREF(run_orphans));
513 spin_lock(&fs->lock);
518 lafs_orphan_forget(fs, db);
/* handlers may ask for an immediate retry with -ERESTARTSYS */
520 int err = -ERESTARTSYS;
521 while (err == -ERESTARTSYS) {
522 switch(LAFSI(db->b.inode)->type) {
524 err = lafs_inode_handle_orphan(db);
527 err = lafs_dir_handle_orphan(db);
531 timeout = msecs_to_jiffies(500);
533 mutex_unlock(&ino->i_mutex);
537 putdref(db, MKREF(run_orphans));
538 spin_lock(&fs->lock);
540 list_splice(&done, &fs->pending_orphans);
541 spin_unlock(&fs->lock);
542 clear_bit(OrphansRunning, &fs->fsstate);
544 if (test_bit(CleanerDisabled, &fs->fsstate)) {
545 struct datablock *db;
546 printk("Orphan sleeping:\n");
547 list_for_each_entry(db, &fs->pending_orphans,
549 printk("orph=%s\n", strblk(&db->b));
552 if (list_empty_careful(&fs->pending_orphans))
553 /* unmount might be waiting... */
554 wake_up(&fs->async_complete);
558 static void lafs_drop_orphan(struct fs *fs, struct datablock *db)
560 /* This block was an orphan but isn't any more.
561 * Remove it from the list.
562 * It must be IOlocked, and this call must not come
563 * from lafs_run_orphans.
565 struct inode *ino = orphan_inode(db);
567 BUG_ON(ino && !mutex_is_locked(&ino->i_mutex));
570 if (test_bit(B_Orphan, &db->b.flags))
/* recheck B_Orphan under fs->lock; skip removal while B_Cleaning */
572 spin_lock(&fs->lock);
573 if (!test_bit(B_Orphan, &db->b.flags) &&
574 !list_empty_careful(&db->orphans) &&
575 !test_bit(B_Cleaning, &db->b.flags))
576 list_del_init(&db->orphans);
577 spin_unlock(&fs->lock);
580 void lafs_add_orphan(struct fs *fs, struct datablock *db)
582 /* This block is an orphan block but might not be
583 * on the list any more.
584 * It now needs orphan processing (probably nlink
585 * has changed, or I_Deleting), so put it back
/* caller must already have set B_Orphan */
588 LAFS_BUG(!test_bit(B_Orphan, &db->b.flags), &db->b);
589 spin_lock(&fs->lock);
590 if (list_empty_careful(&db->orphans))
591 list_add_tail(&db->orphans, &fs->pending_orphans);
592 spin_unlock(&fs->lock);
/* let the fs thread notice the new pending orphan */
593 lafs_wake_thread(fs);
596 void lafs_orphan_forget(struct fs *fs, struct datablock *db)
598 /* This is still an orphan, but we don't want to handle
599 * it just now. When we do, lafs_add_orphan will be called */
600 LAFS_BUG(!test_bit(B_Orphan, &db->b.flags), &db->b);
601 spin_lock(&fs->lock);
/* keep it listed while the cleaner owns it (B_Cleaning) */
602 if (!test_bit(B_Cleaning, &db->b.flags) &&
603 !list_empty_careful(&db->orphans))
604 list_del_init(&db->orphans);
605 spin_unlock(&fs->lock);
608 struct datablock *lafs_find_orphan(struct inode *ino)
610 /* I could walk the child tree of the inode, or
611 * walk the pending_orphan list looking for an
612 * orphan for this inode.
613 * The latter seems easier.
614 * Performance will be quadratic in the size of the
615 * orphan list, so we could possibly consider
616 * improvements later.
618 struct fs *fs = fs_from_inode(ino);
619 struct datablock *db;
621 spin_lock(&fs->lock);
/* linear scan; first match wins */
622 list_for_each_entry(db, &fs->pending_orphans, orphans)
623 if (db->b.inode == ino) {
624 spin_unlock(&fs->lock);
627 spin_unlock(&fs->lock);
631 int lafs_count_orphans(struct inode *ino)
633 /* Count how many active orphan slots there are
634 * in this file (which is the orphan file).
635 * We simply iterate all the slots until we
637 * We keep references to the blocks so lafs_add_orphans
638 * is certain to find them.
/* slots per block: entries are 16 bytes */
641 int slots = 1 << (ino->i_blkbits - 4);
645 struct datablock *db;
651 db = lafs_get_block(ino, bnum, NULL, GFP_KERNEL, MKREF(count_orphans));
654 if (lafs_read_block(db) < 0) {
655 putdref(db, MKREF(count_orphans));
/* type == 0 marks the first unused slot */
660 for (slot = 0; slot < slots ; slot++)
661 if (or[slot].type == 0)
664 unmap_dblock(db, or);
666 putdref(db, MKREF(count_orphans));
667 } while (slot == slots);
/* total = full blocks scanned times slots, plus slots in the last */
669 return bnum * slots + slot;
/* Replay @count orphan records (e.g. after restart): for each record,
 * look up the target inode/block and, if not already flagged, set
 * B_Orphan with its owned references and queue it for processing.
 * Consumes the count_orphans block references taken by
 * lafs_count_orphans.  NOTE(review): sampled excerpt -- error handling
 * between the visible lines is missing from view.
 */
672 void lafs_add_orphans(struct fs *fs, struct inode *ino, int count)
674 int slots = 1 << (ino->i_blkbits - 4);
675 int last_block = (count + slots - 1) >> (ino->i_blkbits - 4);
678 for (bnum = 0; bnum < last_block; bnum++) {
679 struct datablock *db = lafs_get_block(ino, bnum, NULL, GFP_KERNEL, MKREF(add_orphans));
680 int bslot = bnum * slots;
684 /* We already own a count_orphans ref on this block */
686 LAFS_BUG(!test_bit(B_Valid, &db->b.flags), &db->b);
687 putdref(db, MKREF(count_orphans));
690 for (slot = 0; slot < slots && bslot + slot < count; slot++) {
692 struct datablock *ob;
694 LAFS_BUG(or[slot].type == 0, &db->b);
696 oino = lafs_iget_fs(fs,
697 le32_to_cpu(or[slot].filesys),
698 le32_to_cpu(or[slot].inum),
705 ob = lafs_get_block(oino, le32_to_cpu(or[slot].addr), NULL,
706 GFP_KERNEL, MKREF(add_orphan));
/* first to set B_Orphan takes the refs the flag owns */
709 if (!test_and_set_bit(B_Orphan, &ob->b.flags)) {
710 getdref(ob, MKREF(orphan_flag));
711 getdref(db, MKREF(orphan));
712 ob->orphan_slot = bslot + slot;
713 lafs_igrab_fs(ob->b.inode);
714 lafs_add_orphan(fs, ob);
717 putdref(ob, MKREF(add_orphan));
721 unmap_dblock(db, or);
722 putdref(db, MKREF(add_orphans));