/*
 * Copyright (C) 2006-2009
 * NeilBrown <neilb@suse.de>
 * Released under the GPL, version 2
 *
 * There are two quite separate sets of routines here.
 * One set is used for reading and writing filesystem blocks.
 * Reading is generally asynchronous, but can be waited for.
 * Writing is sequential into write-clusters.  It is not possible
 * to wait for a particular write, only for a whole write-cluster.
 * The other set is for all other IO, such as reading and writing
 * superblocks and stateblocks, and reading cluster-heads during
 * roll-forward.  These reads are always synchronous, while writes
 * allow all devices to be written in parallel.
 */
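
/*
 * For example, a data block is normally read through the first set
 * via lafs_read_block() (find, load, then wait), while a superblock
 * goes through the second set via lafs_super_write() followed by
 * lafs_super_wait().  Both paths are defined below; this summary is
 * only illustrative.
 */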
#include <linux/blkdev.h>
#include <linux/bit_spinlock.h>

static int
lafs_dev_find(struct fs *fs, u64 virt)
{
	int i;

	for (i = 0; i < fs->devices; i++)
		if (virt >= fs->devs[i].start &&
		    virt < fs->devs[i].start + fs->devs[i].size)
			return i;
	printk("%llu not found:\n", (unsigned long long)virt);
	for (i = 0; i < fs->devices; i++)
		printk(" %d: %llu+%llu\n", i,
		       (unsigned long long)fs->devs[i].start,
		       (unsigned long long)fs->devs[i].size);
	return -1;
}
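
/*
 * Synchronous page IO: allocate a single bio, submit it, and wait
 * for completion via a struct completion on the stack.  Used for
 * superblocks, stateblocks and similar out-of-band IO.
 */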
static void bi_complete(struct bio *bio, int error)
{
	complete((struct completion *)bio->bi_private);
}

static int
lafs_sync_page_io(struct block_device *bdev, sector_t sector,
		  int offset, int size,
		  struct page *page, int rw)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	struct completion event;
	int ret;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, offset);
	init_completion(&event);
	bio->bi_private = &event;
	bio->bi_end_io = bi_complete;
	submit_bio(rw, bio);
	wait_for_completion(&event);

	ret = !!test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
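
/*
 * Read 'blocks' consecutive filesystem blocks, starting at virtual
 * address 'vaddr', synchronously into the page 'p'.  Returns 0 on
 * success.
 */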
int
lafs_load_page(struct fs *fs, struct page *p, u64 vaddr, int blocks)
{
	int dev;
	sector_t sect;
	struct block_device *bdev;

	virttophys(fs, vaddr, &dev, &sect);

	if (dev < 0 || dev >= fs->devs_loaded) {
		dprintk("dev %d not in [0..%d)\n", dev, fs->devs_loaded);
		return -EIO;
	}
	bdev = fs->devs[dev].bdev;
	return lafs_sync_page_io(bdev, sect, 0,
				 blocks << fs->blocksize_bits,
				 p, READ) ? 0 : -EIO;
}

int
lafs_load_pages(struct fs *fs, struct page *p, u64 vaddr, int blocks)
{
	/* Load 1 or more pages which are consecutive in memory.
	 * FIXME make this async - then wait.
	 */
	int blocks_per_page = (PAGE_SIZE >> fs->blocksize_bits);
	int rv = 0;

	while (blocks && rv == 0) {
		int b = blocks;

		if (b > blocks_per_page)
			b = blocks_per_page;
		rv = lafs_load_page(fs, p, vaddr, b);
		p++;
		vaddr += blocks_per_page;
		blocks -= b;
	}
	return rv;
}
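
/*
 * Asynchronous one-shot IO: the outcome is recorded in a
 * struct async_complete and the filesystem thread is woken to
 * process it.
 */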
static void
bi_async_complete(struct bio *bio, int error)
{
	struct async_complete *ac = bio->bi_private;

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		ac->state = 3; /* loaded */
	else
		ac->state = -EIO;
	bio_put(bio);
	lafs_wake_thread(ac->fs);
}

static void
async_page_io(struct block_device *bdev, sector_t sector, int offset, int size,
	      struct page *page, int rw, struct async_complete *ac)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, offset);
	bio->bi_private = ac;
	bio->bi_end_io = bi_async_complete;
	submit_bio(rw, bio);
}

int
lafs_load_page_async(struct fs *fs, struct page *p, u64 vaddr,
		     int blocks, struct async_complete *ac)
{
	int dev;
	sector_t sect;
	struct block_device *bdev;

	virttophys(fs, vaddr, &dev, &sect);

	if (dev < 0 || dev >= fs->devs_loaded) {
		dprintk("dev %d not in [0..%d)\n", dev, fs->devs_loaded);
		return -EIO;
	}
	bdev = fs->devs[dev].bdev;
	ac->state = 2; /* loading */
	ac->fs = fs;
	async_page_io(bdev, sect, 0,
		      blocks << fs->blocksize_bits,
		      p, READ, ac);
	return 0;
}
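
/*
 * Sketch of the intended use (hypothetical caller):
 *
 *	struct async_complete ac;
 *
 *	lafs_load_page_async(fs, page, vaddr, 1, &ac);
 *
 * The filesystem thread is woken when the read completes, and can
 * check ac.state to see whether the load is still in flight,
 * succeeded, or failed.
 */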
static void
bi_write_done(struct bio *bio, int error)
{
	struct fs *fs = bio->bi_private;

	if (atomic_dec_and_test(&fs->sb_writes_pending))
		wake_up(&fs->sb_writes_wait);
	bio_put(bio);
	/* FIXME didn't do anything with error */
}

void
lafs_super_write(struct fs *fs, int dev, u64 addr, char *buf, int size)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int rw = WRITE | REQ_UNPLUG;

	bio->bi_bdev = fs->devs[dev].bdev;
	bio->bi_sector = addr;
	bio_add_page(bio, virt_to_page(buf), size, offset_in_page(buf));
	bio->bi_private = fs;
	bio->bi_end_io = bi_write_done;
	atomic_inc(&fs->sb_writes_pending);
	submit_bio(rw, bio);
}

int
lafs_super_wait(struct fs *fs)
{
	wait_event(fs->sb_writes_wait,
		   atomic_read(&fs->sb_writes_pending) == 0);

	return 0; /* FIXME should be an error flag */
}
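
/*
 * Sketch of the expected pattern (hypothetical caller): superblocks
 * on all devices are written in parallel, then waited for as a batch:
 *
 *	for (i = 0; i < fs->devices; i++)
 *		lafs_super_write(fs, i, addr, buf, size);
 *	lafs_super_wait(fs);
 *
 * sb_writes_pending counts the bios still in flight; bi_write_done()
 * wakes sb_writes_wait when it reaches zero.
 */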
static int sched(void *flags)
{
	schedule();
	return 0;
}

void _lafs_iolock_block(struct block *b)
{
	while (test_and_set_bit(B_IOLock, &b->flags)) {
		printk("iolock wait for %s:%d: %s\n",
		       b->iolock_file, b->iolock_line,
		       strblk(b));
		wait_on_bit(&b->flags, B_IOLock,
			    sched, TASK_UNINTERRUPTIBLE);
	}
}

int _lafs_iolock_block_async(struct block *b)
{
	if (!test_and_set_bit(B_IOLock, &b->flags)) {
		/* just got the lock! */
		if (test_and_clear_bit(B_Async, &b->flags))
			putref(b, MKREF(async));
		return 1;
	}
	if (test_and_set_bit(B_Async, &b->flags))
		/* already have async set */
		return 0;
	getref(b, MKREF(async));
	return 0;
}

void
lafs_iounlock_block(struct block *b)
{
	/* Unlock this block, and if it is the last locked block
	 * for the page, unlock the page too.
	 * This only applies to data blocks.
	 */
	if (test_bit(B_Index, &b->flags))
		clear_bit(B_IOLock, &b->flags);
	else
		lafs_iocheck_block(dblk(b), 1);

	wake_up_bit(&b->flags, B_IOLock);
	if (test_bit(B_Async, &b->flags))
		lafs_wake_thread(fs_from_inode(b->inode));
}
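
/*
 * Note that whenever B_IOLock or B_Writeback is cleared, any waiter
 * that set B_Async is woken via lafs_wake_thread() so the async
 * state machine can retry its operation.
 */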
void lafs_writeback_done(struct block *b)
{
	/* Remove the writeback flag on this block.
	 * If it is the last writeback block on the page,
	 * release the page as well.
	 */
	if (test_bit(B_Index, &b->flags)) {
		clear_bit(B_Writeback, &b->flags);
		wake_up_bit(&b->flags, B_Writeback);
		if (test_bit(B_Async, &b->flags))
			lafs_wake_thread(fs_from_inode(b->inode));
	} else
		lafs_iocheck_writeback(dblk(b), 1);
}

void lafs_iocheck_block(struct datablock *db, int unlock)
{
	struct page *page = db->page;
	struct datablock *blist;
	int n, i;
	int locked = 0;
	int unlockpage = 0;

	if (!page)
		return;
	blist = (struct datablock *)page->private;
	if (!blist)
		return;

	n = 1 << (PAGE_CACHE_SHIFT - blist->b.inode->i_blkbits);
	bit_spin_lock(B_IOLockLock, &blist->b.flags);
	if (unlock)
		clear_bit(B_IOLock, &db->b.flags);
	for (i = 0; i < n; i++) {
		if (test_bit(B_IOLock, &blist[i].b.flags))
			locked = 1;
	}
	if (!locked && test_and_clear_bit(B_HaveLock, &blist->b.flags))
		unlockpage = 1;
	bit_spin_unlock(B_IOLockLock, &blist->b.flags);

	if (unlockpage) {
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
}
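
/*
 * Worked example: with 4KB pages (PAGE_CACHE_SHIFT == 12) and 1KB
 * blocks (i_blkbits == 10), n is 4, so the page is only unlocked and
 * marked Uptodate once all four datablocks on it have dropped
 * B_IOLock.
 */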
void lafs_iocheck_writeback(struct datablock *db, int unlock)
{
	struct page *page = db->page;
	struct datablock *blist;
	int n, i;
	int locked = 0;
	int done = 0;

	if (!page)
		return;
	blist = (struct datablock *)page->private;
	if (!blist)
		return;

	n = 1 << (PAGE_CACHE_SHIFT - blist->b.inode->i_blkbits);
	bit_spin_lock(B_IOLockLock, &blist->b.flags);
	if (unlock)
		clear_bit(B_Writeback, &db->b.flags);
	for (i = 0; i < n; i++) {
		if (test_bit(B_Writeback, &blist[i].b.flags))
			locked = 1;
		/* FIXME what about checking uptodate ?? */
	}
	if (!locked && test_and_clear_bit(B_HaveWriteback, &blist->b.flags))
		done = 1;
	bit_spin_unlock(B_IOLockLock, &blist->b.flags);

	if (done)
		end_page_writeback(page);
	wake_up_bit(&db->b.flags, B_Writeback);
	if (test_bit(B_Async, &db->b.flags))
		lafs_wake_thread(fs_from_inode(db->b.inode));
}

static int sched_valid(void *flags)
{
	if (test_bit(B_Valid, flags))
		return 1; /* stop waiting - the block is valid */

	schedule();
	return 0;
}

int
lafs_wait_block(struct block *b)
{
	if (test_bit(B_IOLock, &b->flags) &&
	    !test_bit(B_Valid, &b->flags))
		wait_on_bit(&b->flags, B_IOLock,
			    sched_valid, TASK_UNINTERRUPTIBLE);

	return test_bit(B_Valid, &b->flags) ? 0 : -EIO;
}
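
/*
 * lafs_wait_block() is the synchronous partner of
 * lafs_wait_block_async() below: the former sleeps until the block
 * becomes valid or the IO fails, while the latter never sleeps but
 * takes a B_Async reference so the owner is woken to retry later.
 */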
int
lafs_wait_block_async(struct block *b)
{
	if (!test_bit(B_IOLock, &b->flags) ||
	    test_bit(B_Valid, &b->flags)) {
		if (test_and_clear_bit(B_Async, &b->flags))
			putref(b, MKREF(async));
		if (test_bit(B_Valid, &b->flags))
			return 0;
		return -EIO;
	}
	if (test_and_set_bit(B_Async, &b->flags))
		return -EAGAIN;
	getref(b, MKREF(async));
	return -EAGAIN;
}

static void wait_writeback(struct block *b)
{
	if (test_bit(B_Writeback, &b->flags)) {
		printk("writeback wait for %s:%d: %s\n",
		       b->iolock_file, b->iolock_line,
		       strblk(b));
		lafs_trigger_flush(b);
		wait_on_bit(&b->flags, B_Writeback,
			    sched, TASK_UNINTERRUPTIBLE);
	}
}

void _lafs_iolock_written(struct block *b)
{
	_lafs_iolock_block(b);
	wait_writeback(b);
}

int _lafs_iolock_written_async(struct block *b)
{
	if (!test_bit(B_Writeback, &b->flags) &&
	    !test_and_set_bit(B_IOLock, &b->flags)) {
		if (!test_bit(B_Writeback, &b->flags)) {
			/* Have lock without writeback */
			if (test_and_clear_bit(B_Async, &b->flags))
				putref(b, MKREF(async));
			return 1;
		}
		/* Writeback was set by a racing thread.. */
		lafs_iounlock_block(b);
	}
	lafs_trigger_flush(b);
	if (test_and_set_bit(B_Async, &b->flags))
		/* already have async set */
		return 0;
	getref(b, MKREF(async));
	return 0;
}

static void
block_loaded(struct bio *bio, int error)
{
	struct block *b = bio->bi_private;

	dprintk("loaded %d of %d\n", (int)b->fileaddr, (int)b->inode->i_ino);
	if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		set_bit(B_Valid, &b->flags); /* FIXME should I set
					      * the page Uptodate too? */
	} else if (!test_bit(B_Index, &b->flags) && dblk(b)->page) {
		ClearPageUptodate(dblk(b)->page);
		SetPageError(dblk(b)->page);
	} else
		dprintk("Block with no page!!\n");
	lafs_iounlock_block(b);
}

static void
blocks_loaded(struct bio *bio, int error)
{
	struct block *bhead = bio->bi_private;

	while (bhead->chain) {
		struct block *b = bhead->chain;

		bhead->chain = b->chain;
		b->chain = NULL;
		bio->bi_private = b;
		block_loaded(bio, error);
	}
	bio->bi_private = bhead;
	block_loaded(bio, error);
}
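
/*
 * blocks_loaded() allows one bio to carry several blocks: on
 * completion it walks the chain hanging off the head block
 * (bi_private), pointing bi_private at each chained block in turn so
 * block_loaded() can do the per-block work, and finishes with the
 * head block itself.
 */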
int
lafs_load_block(struct block *b, struct bio *bio)
{
	struct page *page;
	int offset;
	struct block_device *bdev;
	struct fs *fs = fs_from_inode(b->inode);
	struct block *headb;
	int dev;
	sector_t sect;

	if (!test_bit(B_PhysValid, &b->flags))
		b->physaddr = 0;
	if (test_bit(B_Valid, &b->flags))
		return 0;
	lafs_iolock_block(b);
	if (test_bit(B_Valid, &b->flags)) {
		lafs_iounlock_block(b);
		return 0;
	}
	LAFS_BUG(test_bit(B_InoIdx, &b->flags), b);
	if (test_bit(B_Index, &b->flags)) {
		struct indexblock *ib = iblk(b);

		if (b->physaddr == 0) {
			/* An empty index block.  One doesn't
			 * see many of these as it means we trimmed
			 * out some blocks, but not all following
			 * blocks, and a block in the hole is being
			 * looked for.  Just create a valid, clear
			 * index block.
			 */
			lafs_clear_index(ib);
			lafs_iounlock_block(b);
			return 0;
		}
		page = virt_to_page(ib->data);
		offset = offset_in_page(ib->data);
	} else {
		struct datablock *db = dblk(b);

		if (b->physaddr == 0) {
			/* block is either in the inode, or
			 * non-existent (all 'nul').
			 */
			struct lafs_inode *lai = LAFSI(b->inode);
			void *baddr = map_dblock(db);

			/* This case is handled in find_block */
			LAFS_BUG(lai->depth == 0 && b->fileaddr == 0, b);

			memset(baddr, 0, (1 << b->inode->i_blkbits));
			unmap_dblock(db, baddr);
			set_bit(B_Valid, &b->flags);
			lafs_iounlock_block(b);
			return 0;
		}
		offset = dblock_offset(db);
		page = db->page;
	}

	virttophys(fs, b->physaddr, &dev, &sect);
	if (dev < 0) {
		lafs_iounlock_block(b);
		return -EIO;
	}
	bdev = fs->devs[dev].bdev;

	if (!bio) {
		bio = bio_alloc(GFP_NOIO, 1);

		bio->bi_bdev = bdev;
		bio->bi_sector = sect;
		bio_add_page(bio, page, fs->blocksize, offset);
		bio->bi_private = b;
		bio->bi_end_io = block_loaded;
		submit_bio(READ, bio);
		return 0;
	}

	LAFS_BUG(b->chain != NULL, b);
	if (bio->bi_size == 0) {
		bio->bi_bdev = bdev;
		bio->bi_sector = sect;
		bio_add_page(bio, page, fs->blocksize, offset);
		bio->bi_private = b;
		bio->bi_end_io = blocks_loaded;
		return 0;
	}
	if (bio->bi_sector + (bio->bi_size / 512) != sect
	    || bio->bi_bdev != bdev
	    || bio_add_page(bio, page, fs->blocksize, offset) == 0)
		return -EINVAL;
	/* added the block successfully */
	headb = bio->bi_private;
	b->chain = headb->chain;
	headb->chain = b;
	return 0;
}
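
/*
 * lafs_load_block() has two modes: given bio == NULL it allocates and
 * submits a single-block bio itself; given a caller-supplied bio it
 * merges the block into that bio when the device and sector line up,
 * chaining blocks through b->chain so blocks_loaded() can complete
 * each one.
 */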
int
lafs_read_block(struct datablock *b)
{
	int rv;

	if (test_bit(B_Valid, &b->b.flags))
		return 0;

	rv = lafs_find_block(b, NOADOPT);
	if (rv)
		return rv;
	rv = lafs_load_block(&b->b, NULL);
	if (rv)
		return rv;
	return lafs_wait_block(&b->b);
}

int
lafs_read_block_async(struct datablock *b)
{
	int rv;

	if (test_bit(B_Valid, &b->b.flags))
		return 0;

	rv = lafs_find_block_async(b);
	if (rv)
		return rv;
	rv = lafs_load_block(&b->b, NULL);
	if (rv)
		return rv;
	return lafs_wait_block_async(&b->b);
}

/*------------------------------------------------------------------
 * Writing filesystem blocks and cluster headers.
 * The endio function is found from lafs_cluster_endio_choose.
 * We need to increment the pending_cnt for this cluster and,
 * if this is a header block, possibly for earlier clusters.
 *
 * Later we should attempt to combine multiple blocks into the
 * one bio ... if we can manage the bi_end_io function properly.
 */
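
/*
 * Rough shape of a cluster flush (hypothetical caller; the real
 * sequencing lives in the cluster code):
 *
 *	lafs_write_block(fs, b, wc);	(once per block)
 *	lafs_write_head(fs, head, head_virt, wc);
 *	lafs_write_flush(fs, wc);
 *
 * write_block() below keeps appending consecutive blocks to one bio
 * and submits it when the next block cannot be merged.
 */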
static void write_block(struct fs *fs, struct page *p, int offset,
			u64 virt, struct wc *wc, int head)
{
	int dev;
	sector_t uninitialized_var(sect);
	int which = wc->pending_next;
	int nr_vecs;
	struct bio *bio = wc->bio;

	virttophys(fs, virt, &dev, &sect);

	if (bio && virt == wc->bio_virt &&
	    bio->bi_bdev == fs->devs[dev].bdev &&
	    which == wc->bio_which &&
	    bio_add_page(bio, p, fs->blocksize, offset) > 0) {
		/* Added to the current bio - too easy */
		wc->bio_virt++;
		return;
	}

	if (bio) {
		int w = wc->bio_which;
		/* need to submit the pending bio and add to pending counts */
		atomic_inc(&wc->pending_cnt[w]);
		if (wc->pending_vfy_type[w] == VerifyNext ||
		    wc->pending_vfy_type[w] == VerifyNext2)
			atomic_inc(&wc->pending_cnt[w]);
		if (wc->pending_vfy_type[w] == VerifyNext2)
			atomic_inc(&wc->pending_cnt[w]);
		wc->bio = NULL;
		if (wc->bio_queue && wc->bio_queue != bdev_get_queue(bio->bi_bdev))
			blk_unplug(wc->bio_queue);
		wc->bio_queue = bdev_get_queue(bio->bi_bdev);
		submit_bio(WRITE, bio);
		bio = NULL;
	}

	if (!virt && !head) {
		/* just a flush request */
		if (wc->bio_queue)
			blk_unplug(wc->bio_queue);
		wc->bio_queue = NULL;
		return;
	}

	nr_vecs = 128; /* FIXME */
	while (!bio && nr_vecs) {
		bio = bio_alloc(GFP_NOIO, nr_vecs);
		if (!bio)
			nr_vecs /= 2;
	}
	wc->bio = bio;
	wc->bio_virt = virt + 1;
	wc->bio_which = which;
	bio->bi_bdev = fs->devs[dev].bdev;
	bio->bi_sector = sect;
	bio_add_page(bio, p, fs->blocksize, offset);
	bio->bi_private = wc;
	bio->bi_end_io = lafs_cluster_endio_choose(which, head);
}

void lafs_write_head(struct fs *fs, struct cluster_head *head, u64 virt,
		     struct wc *wc)
{
	write_block(fs, virt_to_page(head), offset_in_page(head),
		    virt, wc, 1);
}

void lafs_write_block(struct fs *fs, struct block *b, struct wc *wc)
{
	if (test_bit(B_Index, &b->flags))
		write_block(fs, virt_to_page(iblk(b)->data),
			    offset_in_page(iblk(b)->data),
			    b->physaddr, wc, 0);
	else
		write_block(fs, dblk(b)->page, dblock_offset(dblk(b)),
			    b->physaddr, wc, 0);
}

void lafs_write_flush(struct fs *fs, struct wc *wc)
{
	write_block(fs, NULL, 0, 0, wc, 0);
}