3 * Handle data blocks for LaFS
5 * Copyright (C) 2004-2009
6 * NeilBrown <neilb@suse.de>
7 * Released under the GPL, version 2
11 #include <linux/buffer_head.h> /* for try_to_release_page */
12 #include <linux/slab.h>
14 * Data blocks are stored in a regular address space for the
15 * relevant file. A page may have multiple datablocks, but
16 * a datablock cannot extend beyond one page
17 * Each datablock has (or can have) a 'struct datablock' attribute
18 * structure. These are allocated as an array per page, and attached
19 * at the ->private pointer on the page.
/* Lock serialising updates to the per-block reference-holder tables
 * manipulated by add_ref()/del_ref()/has_ref() below. */
24 static DEFINE_SPINLOCK(refl);
/* Debugging aid: record a named reference 'ref' taken on block 'b'.
 * Each block carries a fixed table of 16 holder slots.  The scan below
 * bumps an existing slot with a matching name (the increment itself is
 * not visible in this excerpt); otherwise a free slot 'z' found during
 * the scan is claimed.  'file'/'line' identify the caller so the error
 * report is actionable when all 16 slots are occupied. */
25 void add_ref(struct block *b, char *ref, char *file, int line)
30 for (i = 0; i < 16; i++) {
31 if (b->holders[i].cnt) {
32 if (strcmp(b->holders[i].name, ref) == 0) {
/* All 16 slots busy and no name matched: report exhaustion. */
42 printk(KERN_ERR "LaFS: add_ref all holders are in use"
43 " at %s:%d\nblock: %s\n",
44 file, line, strblk(b));
/* Claim the free slot 'z' (located earlier in the scan, outside this
 * excerpt) for the new holder name. */
47 b->holders[z].cnt = 1;
48 b->holders[z].name = ref;
/* Debugging aid: drop a named reference previously recorded with
 * add_ref().  Scans the 16-slot holder table for an active slot whose
 * name matches 'ref' (the decrement/clear is outside this excerpt).
 * Falling off the loop means an unbalanced put — complain loudly with
 * the caller's file:line. */
52 void del_ref(struct block *b, char *ref, char *file, int line)
56 for (i = 0; i < 16; i++) {
57 if (b->holders[i].cnt &&
58 strcmp(b->holders[i].name, ref) == 0) {
/* No matching holder found: report the imbalance. */
66 printk(KERN_ERR "LaFS: holder %s not found at %s:%d\nblk: %s\n",
67 ref, file, line, strblk(b));
/* Return non-zero iff block 'b' currently holds an active reference
 * registered under the name 'ref' (see add_ref()/del_ref()). */
71 int has_ref(struct block *b, char *ref)
75 for (i = 0; i < 16; i++)
76 if (b->holders[i].cnt &&
77 strcmp(b->holders[i].name, ref) == 0)
84 /* Based on grow_dev_page */
/* Look up (or create) the 'struct datablock' for file block 'index' of
 * inode 'ino', returning it with a reference held (getdref_locked with
 * REF).  A page holds (1<<bits) blocks, so the pagecache is indexed by
 * index>>bits and the block within the page by index & ((1<<bits)-1).
 * Fast path: page already present with its private datablock array. */
86 lafs_get_block(struct inode *ino, unsigned long index, struct page *p,
89 struct datablock *b = NULL;
/* Number of blocks per page, as a shift. */
90 int bits = PAGE_SHIFT - ino->i_blkbits;
93 p = find_get_page(&ino->i_data, index >> bits);
/* private_lock guards ->private and the refcounts of the attached
 * datablock array. */
95 spin_lock(&ino->i_data.private_lock);
97 b = (struct datablock *)p->private;
98 b += index & ((1<<bits)-1);
99 getdref_locked(b, REF);
101 spin_unlock(&ino->i_data.private_lock);
/* The datablock now pins the page for us; drop the page ref. */
102 page_cache_release(p);
/* Slow path: page absent (or had no private data) — create it. */
109 p = find_or_create_page(&ino->i_data, index>>bits, gfp);
111 dprintk("find or create returned NULL\n");
115 if (!PagePrivate(p)) {
/* First file address covered by this page. */
116 unsigned long ind = p->index << bits;
118 /* New page, need to set up attribute blocks */
119 /* FIXME use kmem_cache */
120 dprintk("setting up %p for %lu\n", p, index);
/* One array of (1<<bits) datablocks, zero-initialised, to hang off
 * p->private. */
121 b = kzalloc(sizeof(struct datablock)<<bits, gfp);
125 page_cache_release(p);
/* Initialise every per-block attribute structure in the new array. */
130 for (i = 0; i < (1<<bits); i++) {
132 atomic_set(&b[i].b.refcnt, 0);
135 set_bit(B_Valid, &b[i].b.flags);
136 b[i].b.fileaddr = ind++;
139 b[i].b.parent = NULL;
140 INIT_LIST_HEAD(&b[i].b.siblings);
141 INIT_LIST_HEAD(&b[i].b.lru);
142 INIT_LIST_HEAD(&b[i].b.peers);
143 INIT_LIST_HEAD(&b[i].orphans);
144 INIT_LIST_HEAD(&b[i].cleaning);
147 b[i].my_inode = NULL;
/* Re-check under the lock: another thread may have attached its own
 * array while we were allocating. */
150 spin_lock(&ino->i_data.private_lock);
151 if (!PagePrivate(p)) {
152 p->private = (unsigned long) b;
156 spin_unlock(&ino->i_data.private_lock);
/* Page now definitely has a datablock array — pick ours out of it. */
160 b = (struct datablock *)p->private;
161 b += index & ((1<<bits)-1);
162 /* spinlock is just to sync with lafs_refile */
163 spin_lock(&ino->i_data.private_lock);
164 getdref_locked(b, REF);
165 spin_unlock(&ino->i_data.private_lock);
169 page_cache_release(p);
/* Sanity: the block we return must belong to the requested inode. */
171 LAFS_BUG(b->b.inode != ino, &b->b);
175 /* When a page is truncated, either because the file is being
176 * truncated or because the page is being removed from the
177 * mapping, invalidate_page is called to clean up any
179 * If (and only if) offset == 0, we should discard the ->private
180 * content and clear the PagePrivate flag. This is done by calling
181 * try_to_release_page which calls our lafs_release_page (if there
182 * is no pending writeback).
184 * If any blocks are beyond the end of the (i_size), they should
188 void lafs_invalidate_page(struct page *page, unsigned long offset)
190 struct inode *ino = page->mapping->host;
191 struct super_block *sb = ino->i_sb;
/* Blocks per page, as a shift. */
192 int bits = PAGE_SHIFT - sb->s_blocksize_bits;
193 loff_t size = i_size_read(ino);
/* Byte offset in the file of the first block on this page. */
194 loff_t start = (loff_t)page_index(page) << PAGE_SHIFT;
196 if (PagePrivate(page)) {
199 struct datablock *b = (struct datablock *)page->private;
202 * erase any blocks beyond end-of-file
203 * wait for any pending IO to complete (so page can be freed)
/* Walk every block on the page; take a temporary 'inval' ref on each
 * so it cannot vanish while we operate on it. */
205 for (i = 0; i < (1<<bits); i++) {
206 spin_lock(&ino->i_data.private_lock);
207 (void)getdref_locked(&b[i], MKREF(inval));
208 spin_unlock(&ino->i_data.private_lock);
/* Blocks at/after 'offset' are being invalidated: drop any Async
 * reference they hold. */
210 if (b_start >= offset &&
211 test_and_clear_bit(B_Async, &b[i].b.flags))
212 putdref(&b[i], MKREF(Async));
/* Past EOF on a regular-type file: erase the block entirely. */
214 if (LAFSI(ino)->type >= TypeBase && start >= size)
215 /* Remove block from mapping and file */
216 lafs_erase_dblock(&b[i]);
217 else if (b_start >= offset) {
218 /* Just remove block from mapping */
219 lafs_iolock_written(&b[i].b);
/* Invalidated blocks must not still be dirty or pending realloc. */
220 LAFS_BUG(test_bit(B_Dirty, &b[i].b.flags),
222 LAFS_BUG(test_bit(B_Realloc, &b[i].b.flags),
224 clear_bit(B_Valid, &b[i].b.flags);
225 lafs_iounlock_block(&b[i].b);
/* Advance to the next block's in-page and in-file offsets. */
227 b_start += sb->s_blocksize;
228 start += sb->s_blocksize;
/* A full invalidation (offset==0) must leave no block IOLocked. */
229 LAFS_BUG(offset == 0 &&
230 test_bit(B_IOLock, &b[i].b.flags),
232 putdref(&b[i], MKREF(inval));
/* Full invalidation: try to tear down ->private via our
 * lafs_release_page(). */
236 int success = try_to_release_page(page, 0);
/* ->releasepage hook: free the datablock array attached to 'page' if,
 * and only if, every block on it is quiescent.  Returns non-zero on
 * success (private data released), zero if any block is still busy. */
241 int lafs_release_page(struct page *page, gfp_t gfp_flags)
243 struct address_space * const mapping = page->mapping;
/* Blocks per page, as a shift. */
244 int bits = PAGE_SHIFT - mapping->host->i_blkbits;
/* Saved parent pointers, released after dropping private_lock. */
247 struct indexblock *parents[1<<bits];
248 struct datablock *b = NULL;
250 if (PageWriteback(page)) {
251 BUG(); /* testing - remove this later */
255 spin_lock(&mapping->private_lock);
/* Nothing attached — nothing to release. */
256 if (!PagePrivate(page)) {
257 spin_unlock(&mapping->private_lock);
261 /* based on try_to_free_buffers, we need to
262 * - pass any write errors back up to page
263 * - mark the page clean if the buffers are all clean
264 * - fail if any buffers are busy (pinned, or dirty)
265 * - free the data structures
267 b = (struct datablock *)page->private;
/* First pass: propagate write errors and bail out if any block is
 * still in use (dirty, pinned, IO-locked, under writeback, or with
 * unincorporated index updates). */
268 for (i = 0; i < (1<<bits); i++) {
269 if (test_bit(B_WriteError, &b[i].b.flags))
270 set_bit(AS_EIO, &mapping->flags);
271 if (test_bit(B_Dirty, &b[i].b.flags) ||
272 test_bit(B_Pinned, &b[i].b.flags) ||
273 test_bit(B_IOLock, &b[i].b.flags) ||
274 test_bit(B_Writeback, &b[i].b.flags)
275 /* NOTE: if we find an Uninc is set when we
276 * need to invalidate the page, then we
277 * should be waiting for all pages to be gone
278 * properly before allowing truncate to complete.
279 * The whole file doesn't need to be truncated yet,
280 * that can continue lazily. but all the pages must
281 * be incorporated. Maybe we just need to
282 * wait for a checkpoint here.??
284 || test_bit(B_Uninc, &b[i].b.flags)
286 spin_unlock(&mapping->private_lock);
287 /* This can happen in various valid situations
288 * such as when cleaning a file that is only
289 * read-only to the VM so the VM feels free
290 * to try to release it.
295 /* OK, we are good to go. */
/* Second pass: detach each block from the indexing tree and all lists,
 * take a temporary ref, and reclaim any unused space credits. */
296 for (i = 0; i < (1<<bits); i++) {
297 parents[i] = b[i].b.parent;
298 b[i].b.parent = NULL;
299 list_del_init(&b[i].b.siblings);
300 list_del_init(&b[i].b.lru);
301 list_del_init(&b[i].b.peers);
302 (void)getdref_locked(&b[i], MKREF(lafs_release));
303 if (test_and_clear_bit(B_Credit, &b[i].b.flags))
305 if (test_and_clear_bit(B_ICredit, &b[i].b.flags))
307 if (test_and_clear_bit(B_NCredit, &b[i].b.flags))
309 if (test_and_clear_bit(B_NICredit, &b[i].b.flags))
311 /* When !PagePrivate(page), && refcnt, we hold a ref on the
312 * first block which holds a ref on the page.
313 * When ref on firstblock with !PagePrivate(page) becomes zero,
317 getdref_locked(&b[0], MKREF(lafs_release_0));
323 ClearPagePrivate(page);
325 spin_unlock(&mapping->private_lock);
/* Return the reclaimed credits to the filesystem's space account. */
326 lafs_space_return(fs_from_inode(mapping->host), credits);
/* Drop the temporary refs and each block's ref on its old parent. */
328 for (i = 0; i < (1<<bits); i++) {
329 putdref(&b[i], MKREF(lafs_release));
330 putiref(parents[i], MKREF(child));
336 /* Pinning and dirtying of datablocks.
337 * Before a modification of a datablock can be allowed we must be sure there
338 * will be room to write it out. Thus suitable pre-allocations are required.
339 * There are two general cases to consider.
340 * In one case we are building an internal transaction such as a directory
341 * update. In this case we need to pre-allocate for all blocks that might
342 * be updated and if those preallocations succeed, we make the update
343 * and mark the blocks as dirty. They are also 'pinned' and will be written
344 * as part of the current phase.
345 * In the other case we are simply writing a single block for user-space.
346 * In this case the preallocation is still required, but the block is not
347 * pinned to the phase and so may be written out at any time.
348 * We have a series of functions that help manage this. They are:
350 * This ensures that all parents are loaded in memory and ref-counted
351 * by the target block.
353 * This takes a block with parents and attempts to reserve space for writing
354 * out all of the parents. and the block. This may block or fail if space
357 * This takes a reserved block and pins it to the current phase.
359 * This takes a reserved (possibly pinned) block and marks it dirty.
362 /* pinning a block ensures that we can write to it.
363 * If the block does not already have an allocation
364 * (i.e. a physical address) then we are allowed to
365 * fail -ENOSPC. Otherwise we can at most wait
367 * The update may not occur for a long time, so
368 * the reservations must be preserved across
369 * multiple checkpoints (unlikely but possible).
370 * So each pinning is counted and reserves whatever
371 * is required. When a pin is released the reserved
372 * space is given to the block if it needs it, or
373 * is returned to the filesystem.
/* Reserve the space needed to eventually write block 'b' out, under
 * the allocation policy 'alloc_type' (NewSpace / ReleaseSpace /
 * AccountSpace / CleanSpace — see callers). */
377 lafs_reserve_block(struct block *b, int alloc_type)
380 struct fs *fs = fs_from_inode(b->inode);
383 if (!test_bit(B_PhysValid, &b->flags))
/* An index block must have a parent unless it is the root. */
386 if (test_bit(B_Index, &b->flags))
387 LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags),
/* Make sure all parent index blocks are present and referenced. */
390 err = lafs_setparent(dblk(b));
394 /* If there is already a physaddr, or the data is
395 * stored in the inode, then we aren't really allocating
397 * When unlinking from a small directory, this can
/* Downgrade NewSpace to ReleaseSpace when no genuinely new space is
 * being consumed. */
400 if (alloc_type == NewSpace &&
401 (b->physaddr || (b->fileaddr == 0
402 && LAFSI(b->inode)->depth == 0)))
403 alloc_type = ReleaseSpace;
405 if (alloc_type == NewSpace && test_bit(B_InoIdx, &b->flags))
406 /* physaddr isn't necessarily set for the InoIdx block.
408 alloc_type = ReleaseSpace;
410 /* Important to test EmergencyClean before we
411 * call in to lafs_space_alloc, to avoid races:
412 * space becomes available and EmergencyClean is
413 * set at the same time (strange, but true).
415 in_emergency = test_bit(EmergencyClean, &fs->fsstate);
416 /* Allocate space in the filesystem */
417 err = lafs_prealloc(b, alloc_type);
419 if (alloc_type == NewSpace) {
424 if (alloc_type == ReleaseSpace)
429 /* Allocate space in the file (and quota set) */
/* First preallocation for a datablock with no physaddr yet: charge
 * the block to the summary/quota accounting. */
430 if (b->physaddr == 0 &&
431 !test_bit(B_Index, &b->flags) &&
432 !test_and_set_bit(B_Prealloc, &b->flags)) {
433 err = lafs_summary_allocate(fs, b->inode, 1);
/* Accounting failed — undo the Prealloc claim. */
435 clear_bit(B_Prealloc, &b->flags);
438 LAFS_BUG(alloc_type == AccountSpace, b);
442 /* Having reserved the block, we need to get a segref,
443 * which will involve reserving those blocks too.
444 * However we never get a segref for Root or any
/* For an inode-index block, the segref is taken on the inode's
 * datablock instead. */
448 if (test_bit(B_InoIdx, &b->flags))
449 b = &LAFSI(b->inode)->dblock->b;
452 && !test_bit(B_Root, &b->flags)
453 && !test_bit(B_SegRef, &b->flags))
454 err = lafs_seg_ref_block(b, 0);
/* Pin datablock 'b' to the current phase so a pending update can be
 * written as part of it: reserve space via lafs_reserve_block() and
 * then pin the block (and implicitly its parents). */
460 lafs_pin_dblock(struct datablock *b, int alloc_type)
463 * - pin parents and inode
464 * - preallocate as needed
465 * - reference the old segment
466 * - update flags and pointers.
468 /* FIXME I probably need an iolock here to avoid racing with
469 * lafs_cluster_allocate which can clear dirty and so lose credits.
472 struct fs *fs = fs_from_inode(b->b.inode);
/* Callers must have marked the pin as pending first. */
474 LAFS_BUG(!test_bit(B_PinPending, &b->b.flags), &b->b);
475 if (LAFSI(b->b.inode)->type != TypeSegmentMap) {
476 LAFS_BUG(!test_phase_locked(fs), &b->b);
477 lafs_iolock_written(&b->b);
478 /* If this block is already pinned in the previous
479 * phase, now is a good time to flip it - we know it has
480 * been written and we want to flip it before it
/* Temporarily drop PinPending so lafs_refile() can flip the block to
 * the current phase, then restore it. */
483 if (test_bit(B_Pinned, &b->b.flags) &&
484 !!test_bit(B_Phase1, &b->b.flags) != fs->phase) {
485 clear_bit(B_PinPending, &b->b.flags);
486 lafs_refile(&b->b, 0);
487 set_bit(B_PinPending, &b->b.flags);
489 lafs_iounlock_block(&b->b);
492 err = lafs_reserve_block(&b->b, alloc_type);
497 lafs_pin_block(&b->b);
502 * This cannot fail. The block is already 'pinned' for writing
503 * so any preallocations and other checks have passed.
/* Mark datablock 'b' dirty, consuming the credits that the earlier
 * reserve/pin step set aside.  BUGs if the required credit is absent,
 * since reservation is a precondition. */
506 lafs_dirty_dblock(struct datablock *b)
508 LAFS_BUG(!test_bit(B_Valid, &b->b.flags), &b->b);
510 * FIXME maybe check we aren't dirtying a dirty block
511 * in the previous phase.
513 // LAFS_BUG(b->b.inode->i_ino == 0 && !test_bit(B_Pinned, &b->b.flags), &b->b);
/* First transition to dirty: consume a write credit (B_Credit, or its
 * B_NCredit fallback) and tell the VM the page is dirty. */
514 if (!test_and_set_bit(B_Dirty, &b->b.flags)) {
515 if (!test_and_clear_bit(B_Credit, &b->b.flags))
516 if (!test_and_clear_bit(B_NCredit, &b->b.flags))
517 LAFS_BUG(1, &b->b); // Credit should have been set.
518 __set_page_dirty_nobuffers(b->page);
/* Likewise ensure an incorporation credit (B_UnincCredit) is held,
 * funded from B_ICredit or B_NICredit. */
520 if (!test_and_set_bit(B_UnincCredit, &b->b.flags))
521 if (!test_and_clear_bit(B_ICredit, &b->b.flags))
522 if (!test_and_clear_bit(B_NICredit, &b->b.flags))
523 LAFS_BUG(1, &b->b); /* ICredit should be set before we dirty
/* Erase datablock 'b' — remove it from the file — with the block's
 * iolock already held by the caller (see lafs_erase_dblock*).
 * Handles the special case of block 0 of a depth-0 file (data stored
 * in the inode), returns any space credits, unpins, and notifies the
 * allocator that the old physical address is free. */
528 erase_dblock_locked(struct datablock *b)
530 struct fs *fs = fs_from_inode(b->b.inode);
532 dprintk("Eraseblock for %s\n", strblk(&b->b));
/* Block 0 of a depth-0 file: the data lives inside the inode, so the
 * inode's index block must be cleared out instead. */
533 if (b->b.physaddr == 0 &&
534 b->b.fileaddr == 0 &&
535 LAFSI(b->b.inode)->depth == 0) {
536 /* We need to clear out the index block that this
538 * Need private_lock to be allowed to dereference ->iblock
539 * though if b was dirty we shouldn't.... FIXME.
540 * We need to hold the ref to idb for the getiref_locked_needsync to
543 struct indexblock *ib;
544 struct datablock *idb = lafs_inode_dblock(b->b.inode, SYNC, MKREF(erase));
546 /* not much we can do here */
550 spin_lock(&lafs_hash_lock);
551 ib = LAFSI(b->b.inode)->iblock;
553 getiref_locked_needsync(ib, MKREF(erasedblock));
554 spin_unlock(&lafs_hash_lock);
556 putdref(idb, MKREF(erase));
558 lafs_iolock_written(&ib->b);
/* Still depth 0: bump the depth and wipe the index block. */
559 if (ib->depth == 0) {
560 LAFS_BUG(LAFSI(b->b.inode)->depth !=
563 LAFSI(b->b.inode)->depth = 1;
564 lafs_clear_index(ib);
565 clear_bit(B_PhysValid, &b->b.flags);
566 clear_bit(B_SegRef, &b->b.flags); /* Just in case */
568 lafs_iounlock_block(&ib->b);
569 putiref(ib, MKREF(erasedblock));
/* Sanity check for inode-file blocks: the inode's index tree must not
 * be deeper than 1 here. */
574 if (LAFSI(b->b.inode)->type == TypeInodeFile) {
575 struct inode *ino = rcu_my_inode(b);
576 if (ino && LAFSI(ino)->iblock)
577 LAFS_BUG(LAFSI(ino)->iblock->depth > 1,
582 clear_bit(B_Valid, &b->b.flags);
/* Return any outstanding space credits held by the dirty/realloc
 * state, and undo the summary allocation if it was only preallocated. */
584 if (test_and_clear_bit(B_Dirty, &b->b.flags))
585 lafs_space_return(fs, 1);
586 if (test_and_clear_bit(B_Realloc, &b->b.flags))
587 lafs_space_return(fs, 1);
588 if (test_and_clear_bit(B_Prealloc, &b->b.flags))
589 if (b->b.physaddr == 0)
590 lafs_summary_allocate(fs, b->b.inode, -1);
592 spin_lock(&fs->lock);
593 if (test_bit(B_Pinned, &b->b.flags)) {
594 /* When erasing a pinned dblock it will usually be on a
595 * leaf list, so we must remove it.
596 * However it is IOLocked so it might not be on the leaf list.
598 LAFS_BUG(test_bit(B_Writeback, &b->b.flags), &b->b);
599 if (!list_empty(&b->b.lru)) {
600 list_del_init(&b->b.lru);
/* Account the unpin against the parent's per-phase pin count. */
602 if (!test_bit(B_Root, &b->b.flags))
603 atomic_dec(&b->b.parent->pincnt
604 [!!test_bit(B_Phase1, &b->b.flags)]);
605 clear_bit(B_Pinned, &b->b.flags);
606 spin_unlock(&fs->lock);
/* Parent may now be free to move lists — let refile decide. */
607 if (!test_bit(B_Root, &b->b.flags))
608 lafs_refile(&b->b.parent->b, 0);
610 spin_unlock(&fs->lock);
612 /* we set Writeback to validate the call to lafs_allocated block */
613 set_bit(B_Writeback, &b->b.flags);
614 lafs_iounlock_block(&b->b);
615 if (b->b.parent == NULL)
616 /* Erasing a block that isn't in the indexing tree only
617 * happens when truncating and lafs_invalidate_page is called
618 * on some clean page.
619 * So we don't clear out physaddr here, but instead leave that
620 * to the core truncate code.
621 * Just remove B_PhysValid to avoid confusion.
623 clear_bit(B_PhysValid, &b->b.flags);
624 else if (b->b.physaddr)
/* Tell the allocator the old on-disk location is now free (addr 0). */
625 lafs_allocated_block(fs, &b->b, 0);
627 if (test_and_clear_bit(B_UnincCredit, &b->b.flags))
628 lafs_space_return(fs, 1);
/* Clears the Writeback flag we set above and wakes any waiters. */
629 lafs_writeback_done(&b->b);
/* Synchronous erase: wait for any in-flight write of 'b' to finish
 * (taking the iolock) and then erase it. */
633 lafs_erase_dblock(struct datablock *b)
635 lafs_iolock_written(&b->b);
636 erase_dblock_locked(b);
/* Non-blocking variant of lafs_erase_dblock(): only erase if the
 * iolock could be taken without waiting; 'rv' reports the outcome. */
640 lafs_erase_dblock_async(struct datablock *b)
643 rv = lafs_iolock_written_async(&b->b);
645 erase_dblock_locked(b);
/* Mark index block 'b' dirty.  When 'want_realloc' is requested, try
 * to fund B_Realloc (clean-space rewrite) credits first and fall back
 * to plain dirtying if credits cannot be obtained.  The block must
 * already be pinned (and valid, unless depth 0). */
650 lafs_dirty_iblock(struct indexblock *b, int want_realloc)
652 /* Note, only need to set the phase if locked.
653 * Then no-one may change it while in phase transition.
654 * FIXME maybe check we aren't dirtying a dirty block
655 * in the previous phase.
658 LAFS_BUG(!test_bit(B_Pinned, &b->b.flags), &b->b);
659 LAFS_BUG(!test_bit(B_Valid, &b->b.flags) && b->depth > 0, &b->b);
662 /* Try to mark for Realloc instead. If we cannot get the
663 * credits, fall back on Dirty
665 struct fs *fs = fs_from_inode(b->b.inode);
666 if (!test_bit(B_Realloc, &b->b.flags)) {
667 /* I cannot use B_Credit to fill B_Realloc as that
668 * might still be needed for B_Dirty.
669 * So if we cannot allocate a new credit,
670 * just set the block as 'dirty' now.
/* Freshly allocated clean-space credit funds B_Realloc; if another
 * thread won the race, return the surplus credit. */
672 if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
673 if (test_and_set_bit(B_Realloc, &b->b.flags))
674 lafs_space_return(fs, 1);
678 if (!test_bit(B_UnincCredit, &b->b.flags)) {
679 /* Ditto for UnincCredit */
680 if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
681 if (test_and_set_bit(B_UnincCredit, &b->b.flags))
682 lafs_space_return(fs, 1);
/* Plain dirty path: consume the B_Credit reserved earlier. */
689 if (!test_and_set_bit(B_Dirty, &b->b.flags)) {
690 if (!test_and_clear_bit(B_Credit, &b->b.flags)) {
691 printk(KERN_ERR "Why have I no credits?\n");
692 LAFS_BUG(1, &b->b); // Credit should have been set.
696 /* Iblocks don't always have ICredits. If they have room
697 * for 3 new addresses, the ICredit is not essential. But
/* Opportunistically convert an ICredit into an UnincCredit. */
700 if (!test_bit(B_UnincCredit, &b->b.flags))
701 /* We would like a credit */
702 if (test_and_clear_bit(B_ICredit, &b->b.flags))
703 /* We have a credit */
704 if (test_and_set_bit(B_UnincCredit, &b->b.flags))
705 /* race - we didn't need it after all */
706 lafs_space_return(fs_from_inode(b->b.inode), 1);