3 * Handle data blocks for LaFS
5 * Copyright (C) 2004-2009
6 * NeilBrown <neilb@suse.de>
7 * Released under the GPL, version 2
11 #include <linux/buffer_head.h> /* for try_to_release_page */
13 * Data blocks are stored in a regular address space for the
14 * relevant file. A page may have multiple datablocks, but
15 * a datablock cannot extend beyond one page
16 * Each datablock has (or can have) a 'struct datablock' attribute
17 * structure. These are allocated as an array per page, and attached
18 * at the ->private pointer on the page.
/* Reference-count debugging for struct block.
 * Each block carries a small fixed table (16 slots) of named holders;
 * add_ref()/del_ref() record which call-site currently owns a
 * reference so leaked references can be traced to a file:line.
 * NOTE(review): this listing is elided — lines between the numbered
 * statements are not visible here.
 */
23 static DEFINE_SPINLOCK(refl);
/* Record that holder 'ref' (called from file:line) takes a reference
 * on 'b'.  An existing active slot with the same name is reused
 * (count bump happens in elided code); otherwise a free slot, whose
 * index 'z' is chosen in elided code, is claimed below.
 */
24 void add_ref(struct block *b, char *ref, char *file, int line)
29 for (i = 0; i < 16; i++) {
30 if (b->holders[i].cnt) {
31 if (strcmp(b->holders[i].name, ref) == 0) {
/* All 16 holder slots in use and none matches: report loudly so the
 * leak/overflow can be diagnosed from the caller's file:line. */
41 printk(KERN_ERR "LaFS: add_ref all holders are in use"
42 " at %s:%d\nblock: %s\n",
43 file, line, strblk(b));
/* Claim the free slot 'z' for this new named holder. */
46 b->holders[z].cnt = 1;
47 b->holders[z].name = ref;
/* Drop a reference previously recorded by add_ref(): locate the live
 * holder slot whose name matches 'ref' (the decrement itself is in
 * elided lines).  Falling through the loop without a match means the
 * accounting is unbalanced, so complain with the caller's file:line.
 */
51 void del_ref(struct block *b, char *ref, char *file, int line)
55 for (i = 0; i < 16; i++) {
56 if (b->holders[i].cnt &&
57 strcmp(b->holders[i].name, ref) == 0) {
/* No matching live holder found: unbalanced del_ref. */
65 printk(KERN_ERR "LaFS: holder %s not found at %s:%d\nblk: %s\n",
66 ref, file, line, strblk(b));
71 /* Based on grow_dev_page */
/* Look up (or create) the datablock for 'index' in the file 'ino'.
 * A page holds (1 << bits) datablocks; the per-page array of
 * struct datablock hangs off page->private.  Returns the datablock
 * with a debug reference taken (getdref_locked), or NULL-ish on
 * allocation failure (error paths are in elided lines).
 */
73 lafs_get_block(struct inode *ino, unsigned long index, struct page *p,
76 struct datablock *b = NULL;
/* Number of blocks per page, as a shift. */
77 int bits = PAGE_SHIFT - ino->i_blkbits;
/* Fast path: page already present in the page cache. */
80 p = find_get_page(&ino->i_data, index >> bits);
/* private_lock guards page->private and the ref taken from it. */
82 spin_lock(&ino->i_data.private_lock);
84 b = (struct datablock *)p->private;
/* Index into the per-page datablock array. */
85 b += index & ((1<<bits)-1);
86 getdref_locked(b, REF);
88 spin_unlock(&ino->i_data.private_lock);
/* find_get_page's page reference is no longer needed; the
 * datablock ref pins what we need. */
89 page_cache_release(p);
/* Slow path: page not cached (or fast path failed) — create it. */
96 p = find_or_create_page(&ino->i_data, index>>bits, gfp);
98 dprintk("find or create returned NULL\n");
102 if (!PagePrivate(p)) {
/* File address of the first block on this page. */
104 unsigned long ind = p->index << bits;
106 /* New page, need to set up attribute blocks */
107 /* FIXME use kmem_cache */
108 dprintk("setting up %p for %lu\n", p, index);
/* One allocation covers the whole per-page datablock array. */
109 b = kzalloc(sizeof(struct datablock)<<bits, gfp);
113 page_cache_release(p);
/* Initialise every datablock in the new array. */
117 for (i = 0; i < (1<<bits); i++) {
119 atomic_set(&b[i].b.refcnt, 0);
121 b[i].b.fileaddr = ind++;
124 b[i].b.parent = NULL;
125 INIT_LIST_HEAD(&b[i].b.siblings);
126 INIT_LIST_HEAD(&b[i].b.lru);
127 INIT_LIST_HEAD(&b[i].b.peers);
128 INIT_LIST_HEAD(&b[i].orphans);
129 INIT_LIST_HEAD(&b[i].cleaning);
132 b[i].my_inode = NULL;
134 /* FIXME what else needs to be initialised? */
/* Re-check PagePrivate under the lock: another thread may have
 * attached an array while we were allocating ours. */
137 spin_lock(&ino->i_data.private_lock);
138 if (!PagePrivate(p)) {
139 p->private = (unsigned long) b;
143 spin_unlock(&ino->i_data.private_lock);
/* Pick out the requested block and take the debug reference.
 * NOTE(review): the getdref call appears outside the locked region
 * in this excerpt; the elided lines presumably keep it safe. */
147 b = (struct datablock *)p->private;
148 b += index & ((1<<bits)-1);
149 getdref_locked(b, REF);
153 page_cache_release(p);
/* Sanity: the block we return must belong to the requested inode. */
155 LAFS_BUG(b->b.inode != ino, &b->b);
159 /* When a page is truncated, either because the file is being
160 * truncated or because the page is being removed from the
161 * mapping, invalidate_page is called to clean up any
163 * If (and only if) offset == 0, we should discard the ->private
164 * content and clear the PagePrivate flag. This is done by calling
165 * try_to_release_page which calls our lafs_release_page (if there
166 * is no pending writeback).
168 * If any blocks are beyond the end of the (i_size), they should
172 void lafs_invalidate_page(struct page *page, unsigned long offset)
174 struct inode *ino = page->mapping->host;
175 struct super_block *sb = ino->i_sb;
/* Blocks per page, as a shift (filesystem block size, not i_blkbits). */
176 int bits = PAGE_SHIFT - sb->s_blocksize_bits;
177 loff_t size = i_size_read(ino);
/* Byte offset of this page within the file. */
178 loff_t start = (loff_t)page_index(page) << PAGE_SHIFT;
180 if (PagePrivate(page)) {
183 struct datablock *b = (struct datablock *)page->private;
186 * erase any blocks beyond end-of-file
187 * wait for any pending IO to complete (so page can be freed)
/* Walk every datablock attached to this page. */
189 for (i = 0; i < (1<<bits); i++) {
190 if (LAFSI(ino)->type >= TypeBase && start >= size)
191 /* Remove block from mapping and file */
192 lafs_erase_dblock(&b[i]);
193 else if (b_start >= offset) {
194 /* Just remove block from mapping */
195 lafs_iolock_written(&b[i].b);
/* Invalidating a dirty or realloc-pending block would lose data. */
196 LAFS_BUG(test_bit(B_Dirty, &b[i].b.flags),
198 LAFS_BUG(test_bit(B_Realloc, &b[i].b.flags),
200 clear_bit(B_Valid, &b[i].b.flags);
201 lafs_iounlock_block(&b[i].b);
/* Advance to the next block's offsets. */
203 b_start += sb->s_blocksize;
204 start += sb->s_blocksize;
/* Full-page invalidate (offset == 0) requires no block be IOLocked. */
205 LAFS_BUG(offset == 0 &&
206 test_bit(B_IOLock, &b[i].b.flags),
/* Full invalidation: drop ->private via our lafs_release_page. */
211 int success = try_to_release_page(page, 0);
/* ->releasepage: detach the datablock array from 'page' so the page
 * can be freed.  Fails (via the elided paths) if any block is still
 * busy.  Modelled on try_to_free_buffers (see comment below).
 */
216 int lafs_release_page(struct page *page, gfp_t gfp_flags)
218 struct address_space * const mapping = page->mapping;
219 int bits = PAGE_SHIFT - mapping->host->i_blkbits;
/* Parents of each block, collected so their refs can be dropped
 * after the private_lock is released.  (VLA on stack.) */
222 struct indexblock *parents[1<<bits];
223 struct datablock *b = NULL;
/* Pages under writeback must not be released. */
225 if (PageWriteback(page)) {
226 BUG(); /* testing - remove this later */
230 spin_lock(&mapping->private_lock);
/* Someone else already released the private data. */
231 if (!PagePrivate(page)) {
232 spin_unlock(&mapping->private_lock);
236 /* based on try_to_free_buffers, we need to
237 * - pass any write errors back up to page
238 * - mark the page clean if the buffers are all clean
239 * - fail if any buffers are busy (pinned, or dirty)
240 * - free the data structures
242 b = (struct datablock *)page->private;
/* First pass: propagate write errors and refuse if any block busy. */
243 for (i = 0; i < (1<<bits); i++) {
244 if (test_bit(B_WriteError, &b[i].b.flags))
245 set_bit(AS_EIO, &mapping->flags);
246 if (test_bit(B_Dirty, &b[i].b.flags) ||
247 test_bit(B_Pinned, &b[i].b.flags) ||
248 test_bit(B_IOLock, &b[i].b.flags) ||
249 test_bit(B_Writeback, &b[i].b.flags)
250 /* NOTE: if we find an Uninc is set when we
251 * need to invalidate the page, then we
252 * should be waiting for all pages to be gone
253 * properly before allowing truncate to complete.
254 * The whole file doesn't need to be truncated yet,
255 * that can continue lazily. but all the pages must
256 * be incorporated. Maybe we just need to
257 * wait for a checkpoint here.??
259 || test_bit(B_Uninc, &b[i].b.flags)
261 spin_unlock(&mapping->private_lock);
262 LAFS_BUG(1, &b[i].b);
263 /* This not really a bug, but bugs can lead
264 * here, and this is an unusual situation
265 * (currently) so we BUG here to be safe.
266 * When we find a situation that does fail a
267 * release_page with good reason, we should
273 /* OK, we are good to go. */
/* Second pass: detach each block from its parent and lists, take a
 * temporary debug ref, and return any unused space credits. */
274 for (i = 0; i < (1<<bits); i++) {
275 parents[i] = b[i].b.parent;
276 b[i].b.parent = NULL;
277 list_del_init(&b[i].b.siblings);
278 list_del_init(&b[i].b.lru);
279 list_del_init(&b[i].b.peers);
280 (void)getdref_locked(&b[i], MKREF(lafs_release));
281 if (test_and_clear_bit(B_Credit, &b[i].b.flags))
283 if (test_and_clear_bit(B_ICredit, &b[i].b.flags))
285 if (test_and_clear_bit(B_NCredit, &b[i].b.flags))
287 if (test_and_clear_bit(B_NICredit, &b[i].b.flags))
289 /* When !PagePrivate(page), && refcnt, we hold a ref on the
290 * first block which holds a ref on the page.
291 * When ref on firstblock with !PagePrivate(page) becomes zero,
295 getdref_locked(&b[0], MKREF(lafs_release_0));
301 ClearPagePrivate(page);
303 spin_unlock(&mapping->private_lock);
/* Return freed credits to the filesystem's space accounting. */
304 lafs_space_return(fs_from_inode(mapping->host), credits);
/* Drop the temporary refs and each block's ref on its old parent. */
306 for (i = 0; i < (1<<bits); i++) {
307 putdref(&b[i], MKREF(lafs_release));
308 putiref(parents[i], MKREF(child));
314 /* Pinning and dirtying of datablocks.
315 * Before a modification of a datablock can be allowed we must be sure there
316 * will be room to write it out. Thus suitable pre-allocations are required.
317 * There are two general cases to consider.
318 * In one case we are building an internal transaction such as a directory
319 update. In this case we need to pre-allocate for all blocks that might
320 * be updated and if those preallocations succeed, we make the update
321 * and mark the blocks as dirty. They are also 'pinned' and will be written
322 * as part of the current phase.
323 * In the other case we are simply writing a single block for user-space.
324 * In this case the preallocation is still required, but the block is not
325 * pinned to the phase and so may be written out at any time.
326 * We have a series of functions that help manage this. They are:
328 * This ensures that all parents are loaded in memory and ref-counted
329 * by the target block.
331 * This takes a block with parents and attempts to reserve space for writing
332 out all of the parents and the block. This may block or fail if space
335 * This takes a reserved block and pins it to the current phase.
337 * This takes a reserved (possibly pinned) block and marks it dirty.
340 /* pinning a block ensures that we can write to it.
341 * If the block does not already have an allocation
342 * (i.e. a physical address) then we are allowed to
343 * fail -ENOSPC. Otherwise we can at most wait
345 * The update may not occur for a long time, so
346 * the reservations must be preserved across
347 * multiple checkpoints (unlikely but possible).
348 * So each pinning is counted and reserves whatever
349 * is required. When a pin is released the reserved
350 * space is given to the block if it needs it, or
351 * is returned to the filesystem.
/* Reserve everything needed to (eventually) write 'b':
 * parents loaded, space preallocated, summary/quota charged,
 * and a segment reference taken.  'alloc_type' selects the
 * space-accounting pool (NewSpace / ReleaseSpace / CleanSpace).
 */
355 lafs_reserve_block(struct block *b, int alloc_type)
358 struct fs *fs = fs_from_inode(b->inode);
360 if (!test_bit(B_PhysValid, &b->flags))
/* An index block must have a parent unless it is the root. */
363 if (test_bit(B_Index, &b->flags))
364 LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags),
/* Data block: make sure the parent chain is in place. */
367 err = lafs_setparent(dblk(b));
369 /* If there is already a physaddr, or the data is
370 * stored in the inode, then we aren't really allocating
372 * When unlinking from a small directory, this can
375 if (alloc_type == NewSpace &&
376 (b->physaddr || (b->fileaddr == 0
377 && LAFSI(b->inode)->depth == 0)))
378 alloc_type = ReleaseSpace;
380 if (alloc_type == NewSpace && test_bit(B_InoIdx, &b->flags))
381 /* physaddr isn't necessarily set for the InoIdx block.
383 alloc_type = ReleaseSpace;
385 /* Allocate space in the filesystem */
386 err = err ?: lafs_prealloc(b, alloc_type);
388 /* Allocate space in the file (and quota set) */
/* B_Prealloc doubles as "summary space charged"; only charge once. */
389 if (err == 0 && b->physaddr == 0 &&
390 !test_bit(B_Index, &b->flags) &&
391 !test_and_set_bit(B_Prealloc, &b->flags)) {
392 err = lafs_summary_allocate(fs, b->inode, 1);
/* Charge failed: roll back the flag so a retry can re-charge. */
394 clear_bit(B_Prealloc, &b->flags);
397 /* Having reserved the block, we need to get a segref,
398 * which will involve reserving those blocks too.
399 * However we never get a segref for Root.
403 && !test_bit(B_Root, &b->flags)
404 && !test_bit(B_SegRef, &b->flags))
405 err = lafs_seg_ref_block(b, 0);
409 /* FIXME maybe CleanSpace should return -EAGAIN if there
410 * is a good chance that the cleaner will help out soon??
411 * I wonder how "soon" can be defined.
/* Error disposition depends on the pool we reserved from
 * (the actual returns are in elided lines). */
413 if (alloc_type == CleanSpace || alloc_type == NewSpace)
415 if (alloc_type == ReleaseSpace)
/* Pin a datablock into the current phase so it can be modified.
 * Steps (per the original comment below): pin parents and inode,
 * preallocate, reference the old segment, update flags/pointers.
 */
421 lafs_pin_dblock(struct datablock *b, int alloc_type)
424 * - pin parents and inode
425 * - preallocate as needed
426 * - reference the old segment
427 * - update flags and pointers.
429 /* FIXME I probably need an iolock here to avoid racing with
430 * lafs_cluster_allocate which can clear dirty and so lose credits.
433 struct fs *fs = fs_from_inode(b->b.inode);
436 /* We don't pin a datablock of an inode if there is an
437 * InoIdx block. We pin the InoIdx block instead.
438 * They might both be pinned at the same time, but
439 * only when the index block has swapped phase and the
440 * data block is waiting to be written.
442 if (LAFSI(b->b.inode)->type == TypeInodeFile &&
444 LAFSI(b->my_inode)->iblock) {
/* Inode-file case: pin via the inode's index block instead. */
445 struct indexblock *ib = lafs_make_iblock(b->my_inode, ADOPT, SYNC,
448 blk = getref(&b->b, MKREF(pindb));
/* Ordinary case: pin the datablock itself. */
452 blk = getref(&b->b, MKREF(pindb));
/* Pinning requires the phase lock to be held by this thread. */
454 LAFS_BUG(!test_phase_locked(fs), &b->b);
/* Wait for any phase transition on the target to finish. */
456 lafs_phase_wait(blk);
/* Mark the modification as pending before reserving, so the
 * reservation cannot be reclaimed underneath us. */
458 set_bit(B_PinPending, &b->b.flags);
459 err = lafs_reserve_block(blk, alloc_type);
/* Reservation failed: undo the pending mark and drop our ref. */
462 clear_bit(B_PinPending, &b->b.flags);
463 putref(blk, MKREF(pindb));
468 putref(blk, MKREF(pindb));
473 * This cannot fail. The block is already 'pinned' for writing
474 * so any preallocations and other checks have passed.
/* Mark a (pinned, reserved) datablock dirty, consuming one of the
 * credits set aside by lafs_reserve_block(). */
477 lafs_dirty_dblock(struct datablock *b)
/* Only valid blocks may be dirtied. */
479 LAFS_BUG(!test_bit(B_Valid, &b->b.flags), &b->b);
480 if (test_bit(B_Pinned, &b->b.flags))
/* A pinned dirty block means the superblock needs writing. */
481 b->b.inode->i_sb->s_dirt = 1;
483 * FIXME maybe check we aren't dirtying a dirty block
484 * in the previous phase.
/* First dirtying: consume a Credit (or NCredit) for the write. */
487 if (!test_and_set_bit(B_Dirty, &b->b.flags)) {
488 if (!test_and_clear_bit(B_Credit, &b->b.flags))
489 if (!test_and_clear_bit(B_NCredit, &b->b.flags))
490 LAFS_BUG(1, &b->b); // Credit should have been set.
/* Keep the VM's view of the page in sync. */
491 __set_page_dirty_nobuffers(b->page);
/* Also ensure an incorporation credit is held (ICredit/NICredit
 * converts into UnincCredit). */
493 if (!test_and_set_bit(B_UnincCredit, &b->b.flags))
494 if (!test_and_clear_bit(B_ICredit, &b->b.flags))
495 if (!test_and_clear_bit(B_NICredit, &b->b.flags))
496 LAFS_BUG(1, &b->b); // ICredit should be set before we dirty
499 // FIXME Do I need to do something with PinPending??
/* Remove a datablock from the file entirely: invalidate its content,
 * return any held credits, unpin it, and tell the allocator the
 * physical space is free.  Used by truncate/invalidate paths.
 */
504 lafs_erase_dblock(struct datablock *b)
506 struct fs *fs = fs_from_inode(b->b.inode);
508 dprintk("Eraseblock for %s\n", strblk(&b->b));
/* Wait for any in-flight write of this block to finish. */
509 lafs_iolock_written(&b->b);
/* Special case: block 0 of a depth-0 file holds data "in the inode";
 * erasing it must also clear the in-memory index block. */
510 if (b->b.physaddr == 0 &&
511 b->b.fileaddr == 0 &&
512 LAFSI(b->b.inode)->depth == 0) {
513 /* We need to clear out the index block that this
515 * Need private_lock to be allowed to dereference ->iblock
516 * though if b was dirty we shouldn't.... FIXME.
518 struct indexblock *ib;
519 spin_lock(&b->b.inode->i_data.private_lock);
520 ib = LAFSI(b->b.inode)->iblock;
522 getiref_locked(ib, MKREF("erasedblock"));
523 spin_unlock(&b->b.inode->i_data.private_lock);
525 lafs_iolock_written(&ib->b);
/* Re-check depth under the iolock before clearing. */
526 if (ib->depth == 0) {
527 LAFS_BUG(LAFSI(b->b.inode)->depth !=
529 lafs_clear_index(ib);
530 clear_bit(B_PhysValid, &b->b.flags);
532 lafs_iounlock_block(&ib->b);
533 putiref(ib, MKREF("erasedblock"));
/* Inode-file consistency check; assumption per elided context. */
537 if (LAFSI(b->b.inode)->type == TypeInodeFile &&
538 LAFSI(b->my_inode)->iblock)
539 LAFS_BUG(LAFSI(b->my_inode)->iblock->depth > 1,
/* The content is gone. */
542 clear_bit(B_Valid, &b->b.flags);
/* Return the write credits this block no longer needs. */
544 if (test_and_clear_bit(B_Dirty, &b->b.flags))
545 lafs_space_return(fs, 1);
546 if (test_and_clear_bit(B_Realloc, &b->b.flags))
547 lafs_space_return(fs, 1);
/* Undo the summary/quota charge if space was never really used. */
548 if (test_and_clear_bit(B_Prealloc, &b->b.flags))
549 if (b->b.physaddr == 0)
550 lafs_summary_allocate(fs, b->b.inode, -1);
552 spin_lock(&fs->lock);
553 if (test_bit(B_Pinned, &b->b.flags)) {
554 /* When erasing a pinned dblock it will usually be on a
555 * leaf list, so we must remove it.
556 * However it is IOLocked so it might not be on the leaf list.
559 LAFS_BUG(test_bit(B_Writeback, &b->b.flags), &b->b);
560 if (!list_empty(&b->b.lru)) {
562 list_del_init(&b->b.lru);
/* Unpin: drop the parent's pin count for this block's phase. */
564 if (!test_bit(B_Root, &b->b.flags))
565 atomic_dec(&b->b.parent->pincnt
566 [!!test_bit(B_Phase1, &b->b.flags)]);
567 clear_bit(B_Pinned, &b->b.flags);
568 spin_unlock(&fs->lock);
/* Parent may now be refile-able since it lost a pinned child. */
569 if (!test_bit(B_Root, &b->b.flags))
570 lafs_refile(&b->b.parent->b, 0);
572 putiref(b, MKREF(leaf));
574 spin_unlock(&fs->lock);
576 /* we set Writeback to validate the call to lafs_allocated block */
577 set_bit(B_Writeback, &b->b.flags);
578 lafs_iounlock_block(&b->b);
579 if (b->b.parent == NULL)
580 /* Erasing a block that isn't in the indexing tree only
581 * happens when truncating and lafs_invalidate_page is called
582 * on some clean page.
583 * So we don't clear out physaddr here, but instead leave that
584 * to the core truncate code.
585 * Just remove B_PhysValid to avoid confusion.
587 clear_bit(B_PhysValid, &b->b.flags);
588 else if (b->b.physaddr)
/* Tell the segment/allocation tracking the old address is free. */
589 lafs_allocated_block(fs, &b->b, 0);
591 if (test_and_clear_bit(B_UnincCredit, &b->b.flags))
592 lafs_space_return(fs, 1);
/* Clear the Writeback we set above and wake any waiters. */
593 lafs_writeback_done(&b->b);
/* Mark an index block dirty, consuming a reserved Credit.
 * NOTE(review): this function continues beyond the visible end of
 * this excerpt; only the visible lines are annotated.
 */
597 lafs_dirty_iblock(struct indexblock *b)
600 /* FIXME is this all I have to do here?
601 * Do I need to put it on a list, or lock or something?
603 * Note, only need to set that phase if locked.
604 * Then no-one may change it while in phase transition.
605 * FIXME maybe check we aren't dirtying a dirty block
606 * in the previous phase.
/* Index blocks must already be pinned before they can be dirtied. */
609 LAFS_BUG(!test_bit(B_Pinned, &b->b.flags), &b->b);
/* First dirtying: consume the write Credit reserved earlier. */
610 if (!test_and_set_bit(B_Dirty, &b->b.flags)) {
611 if (!test_and_clear_bit(B_Credit, &b->b.flags)) {
612 printk(KERN_ERR "Why have I no credits?\n");
613 LAFS_BUG(1, &b->b); // Credit should have been set.
617 /* Iblocks don't always have ICredits. If they have room
618 * for 3 new addresses, the ICredit is not essential. But
/* Opportunistically convert an ICredit to an UnincCredit; if we
 * raced and one appeared meanwhile, give the spare credit back. */
621 if (!test_bit(B_UnincCredit, &b->b.flags))
622 /* We would like a credit */
623 if (test_and_clear_bit(B_ICredit, &b->b.flags))
624 /* We have a credit */
625 if (test_and_set_bit(B_UnincCredit, &b->b.flags))
626 /* race - we didn't need it after all */
627 lafs_space_return(fs_from_inode(b->b.inode), 1);
/* Dirty index block implies the superblock needs writing. */
629 b->b.inode->i_sb->s_dirt = 1;