5 * Copyright (C) 2005-2009
6 * Neil Brown <neilb@suse.de>
7 * Released under the GPL, version 2
10 * We have an index block (indirect, extent, or index), which may contain
11 * some addresses already.
12 * It has an 'uninc_table' which is a short list of addresses to
15 * Best option is that all the new addresses fit and we don't need to
16 * split or even change between indirect and extent.
17 * But sometimes format changing and splitting are needed.
19 * To handle format changing we generally create a new table and
20 * copy all data into it. We maintain some fast paths for the
21 * really simple updates such as indirect when all new blocks fit in the
22 * range, or index where all new ranges come at the end, and fit.
24 * In general, the first result of incorporation is a new block
25 * with new addresses, a possibly empty old block, and a possibly empty
26 * list of addresses that didn't fit into the new block.
28 * If the old block and the list of addresses are empty, we simply
29 * swizzle the block pointers and are done. If not, we have to split the
30 * index block and we need to inform the parent. If there is no parent
31 * to inform (we are an inode) then we have to grow the depth of the
35 * - When an index block is split, we need to scan all children and
36 * reparent those with a new parent.
37 * - If an index block becomes empty, we can mark it a Hole.
38 * - If the inode block has only one index, maybe we should
41 * Memory allocation issues.
42 * We are potentially in the write path for freeing memory when
43 * performing incorporation so we have to be careful about memory
45 * Sometimes we need to allocate a new page for a page split. It seems
46 * likely that the old page will be able to be written and freed soon
47 * but there is no guarantee as new pages might be dirtied at any time
48 * and so pin the index page back in memory.
49 * The key to avoiding memory exhaustion is to block the dirtying of
50 * new pages early enough... We will have to worry about that later.
/* NOTE(review): this chunk is an elided extract -- the leading numbers are
 * original file line numbers and many lines are missing, so the body below
 * is not syntactically complete.  Comments only; no code changed. */
53 static void sort_blocks(struct block **blkp)
55 /* sort blocks list in *blkp by ->fileaddr
65 struct block **blp[2], *b[2];
75 /* take least of b[0] and b[1]
76 * if it is larger than prev, add to
77 * blp[curr], else swap curr then add
79 while (b[0] || b[1]) {
80 if (b[next] == NULL ||
/* NOTE(review): this three-way XOR appears to pick the smaller head under a
 * wrap-around ordering relative to 'prev' -- confirm against the full file. */
82 !((prev <= b[1-next]->fileaddr)
83 ^(prev <= b[next]->fileaddr)
84 ^(b[next]->fileaddr <= b[1-next]->fileaddr)))
88 if (b[next]->fileaddr < prev)
90 prev = b[next]->fileaddr;
92 blp[curr] = &b[next]->chain;
93 b[next] = b[next]->chain;
/* NOTE(review): 'bl' does not match the declared 'blp'/'b' names visible
 * above -- likely declared on an elided line; verify before editing. */
97 } while (bl[0] && bl[1]);
/* NOTE(review): elided extract -- clears incorporation-related flags on a
 * just-incorporated child index block and drops the references those flags
 * held; the return of collected credits is on elided lines. */
104 static int incorp_index(struct block *b)
106 /* This is an index block which we have just incorporated.
107 * Update flags accordingly.
110 clear_bit(B_Uninc, &b->flags);
111 if (test_and_clear_bit(B_UnincCredit, &b->flags)) {
115 if (test_and_clear_bit(B_PrimaryRef, &b->flags)) {
116 struct block *pb = list_entry(b->siblings.prev,
117 struct block, siblings);
/* A PrimaryRef is held on the previous sibling, so one must exist. */
118 LAFS_BUG(b->siblings.prev == &b->parent->children, b);
119 putref(pb, MKREF(primary));
120 if (!test_bit(B_EmptyIndex, &b->flags))
121 lafs_hash_iblock(iblk(b));
123 putref(b, MKREF(uninc));
/* NOTE(review): elided extract -- fast-path: if every pending address lands
 * inside the range this indirect block covers, write it in place.  The
 * range test and return are partly on elided lines. */
127 static int incorporate_indirect(struct uninc *ui, char *buf, u32 addr, int len)
129 /* See if this uninc info can be incorporated directly into
130 * this indirect block, and if it can: do it.
/* Reject if the last pending range ends beyond the block's coverage. */
135 if ( ui->pending_addr[ui->pending_cnt-1].fileaddr
136 +ui->pending_addr[ui->pending_cnt-1].cnt
140 for (i = 0; i < ui->pending_cnt; i++) {
141 u32 ad = ui->pending_addr[i].fileaddr - addr;
142 u32 cnt = ui->pending_addr[i].cnt;
143 u64 phys = ui->pending_addr[i].physaddr;
/* Indirect entries are 6 bytes each (48-bit physical address). */
146 char *p = buf + 6*ad;
/* NOTE(review): elided extract -- fast-path append of new extents at the
 * end of an extent-format leaf.  Extent entries are 12 bytes:
 * 6 phys + 2 count + 4 fileaddr (per the decode/encode calls below). */
155 static int incorporate_extent(struct uninc *ui, char *buf, int size)
157 /* See if the common case of new data being added to the file
158 * applies. i.e. is there enough space in the buf to add the
159 * extents listed in ui. If so, add them. */
160 /* Find the last valid extent. */
169 b = buf + (e-1)*12 + 6;
/* NOTE(review): 'size' added to the decoded fileaddr looks odd here --
 * possibly a different local on an elided line; confirm against full file. */
172 last = decode32(b)+size;
177 /* there are 'e' valid extents; */
/* Fail unless all pending extents fit and start at or after 'last'. */
178 if (size/12 - e < ui->pending_cnt
179 || ui->pending_addr[0].fileaddr < last)
184 for (i = 0; i < ui->pending_cnt; i++)
185 if (ui->pending_addr[i].physaddr) {
186 dprintk("AddedX %llu %u %lu\n",
187 (unsigned long long)ui->pending_addr[i].physaddr,
188 (unsigned) ui->pending_addr[i].cnt,
189 (unsigned long)ui->pending_addr[i].fileaddr);
190 credits += ui->pending_addr[i].cnt;
/* NOTE(review): presumably encode48/16/32 advance 'buf' (or elided lines
 * do); otherwise these writes would overlap -- verify. */
191 encode48(buf, ui->pending_addr[i].physaddr);
192 encode16(buf, ui->pending_addr[i].cnt);
193 encode32(buf, ui->pending_addr[i].fileaddr);
/* NOTE(review): elided extract -- merges a chain of uninc child blocks into
 * an index-format buffer (10-byte entries: 6 phys + 4 fileaddr), counting
 * first and then merging backwards so nothing is overwritten. */
198 static int incorporate_index(struct block *uninc, char *buf, int size)
200 /* see if we can merge the addresses of uninc blocks
201 * with those in buf, by counting the total number,
202 * and if they fit, merge backwards.
203 * Return -1 on failure, or number of credits collected.
218 b = buf + (icnt-1)*10;
/* First pass: count new (u*) vs existing (i*) entries. */
224 for (blk = uninc ; blk ; blk = blk->chain)
228 LAFS_BUG(ucnt == 0, uninc);
229 uaddr = uninc->fileaddr;
232 while (blk || i < icnt) {
234 uaddr = blk->fileaddr;
243 if (uaddr == iaddr) {
252 } else if (uaddr < iaddr) {
253 /* New entry, unlikely to have a phys of 0... */
257 delcnt++; /* not strictly a delete, but too
258 * confusing to handle.
261 } else {/* Old unchanged entry */
/* Fail if the merged total would not fit in the buffer. */
266 if (ncnt > (size/10))
268 if (repcnt == icnt) {
269 /* all currently incorporated addresses are
270 * replaced or removed, so just install the
279 uninc = uninc->chain;
281 encode48(b, blk->physaddr);
282 encode32(b, blk->fileaddr);
285 credits += incorp_index(blk);
296 /* some addresses have been deleted. Too hard to do
301 /* OK, have n addresses, and we can write them from last
302 * to first without overwriting anything, so let's do it
306 struct block *t = blk;
308 uninc = uninc->chain;
317 uaddr = uninc->fileaddr;
321 b = buf + (i-1)*10 + 6;
326 if (uaddr == iaddr && uaddr == 0) {
327 /* '0' is both least value and end marker, so
328 * be careful. We know there is no deletion, so
329 * ->physaddr cannot be zero.
333 BUG_ON(uninc->physaddr == 0);
335 encode48(b, uninc->physaddr);
341 } else if (uaddr == iaddr) {
342 BUG_ON(uninc->physaddr == 0);
345 encode48(b, uninc->physaddr);
349 } else if (uaddr > iaddr) {
350 if (uninc->physaddr) {
353 encode48(b, uninc->physaddr);
370 uninc = uninc->chain;
371 credits += incorp_index(blk);
/* NOTE(review): elided extract -- zeroes the index area of a block and
 * stamps the layout type (INDIRECT at depth 1, INDEX above) from ib->depth;
 * the depth-selection 'if' for the INDIRECT case is on an elided line. */
379 void lafs_clear_index(struct indexblock *ib)
381 int len = 1 << ib->b.inode->i_blkbits;
/* InoIdx blocks share the inode block: index data starts after metadata. */
384 if (test_bit(B_InoIdx, &ib->b.flags))
385 offset = LAFSI(ib->b.inode)->metadata_size;
386 buf = map_iblock(ib);
387 memset(buf+offset, 0, len - offset);
390 *(u16 *)(buf+offset) = cpu_to_le16(IBLK_INDIRECT);
391 else if (ib->depth > 1)
392 *(u16 *)(buf+offset) = cpu_to_le16(IBLK_INDEX);
397 /* just to be on the safe side ... */
398 ((struct la_inode*)(buf))->depth = ib->depth;
400 unmap_iblock(ib, buf);
401 set_bit(B_Valid, &ib->b.flags);
/* NOTE(review): elided extract -- grows tree depth by demoting the InoIdx
 * block 'ib' to a plain index block under a new InoIdx block 'new'. */
404 static void grow_index_tree(struct indexblock *ib, struct indexblock *new)
406 /* leaf_incorporate or internal_incorporate was working on an inode
407 * and didn't find enough space.
408 * So demote 'ib' to a regular Index block and make 'new' a new
409 * InoIdx block (inode->iblock);
410 * We have IOLock on ib to ensure exclusivity.
413 int blksize = ib->b.inode->i_sb->s_blocksize;
414 int offset = LAFSI(ib->b.inode)->metadata_size;
/* 'ib' takes over new's data page; it is no longer the inode's iblock. */
417 ib->data = new->data; new->data = NULL;
418 clear_bit(B_InoIdx, &ib->b.flags);
419 lafs_hash_iblock(ib);
420 set_bit(B_InoIdx, &new->b.flags);
421 if (test_and_clear_bit(B_Root, &ib->b.flags))
422 set_bit(B_Root, &new->b.flags);
425 new->b.inode = ib->b.inode;
426 new->b.parent = ib->b.parent;
428 getiref(new, MKREF(child));
430 clear_bit(B_PhysValid, &ib->b.flags);
/* Copy the old index content out of the inode block into ib's new buffer. */
433 ibuf = map_iblock(new);
434 memcpy(ib->data, ibuf + offset, blksize - offset);
435 memset(ib->data + blksize - offset, 0, offset);
436 LAFS_BUG(ib->depth == 0, &ib->b);
437 new->depth = ib->depth + 1;
438 LAFSI(new->b.inode)->depth++;
439 ((struct la_inode*)(ibuf))->depth = LAFSI(new->b.inode)->depth;
441 /* There is no need to set PrimaryRef as the first child in an
442 * index block is implicitly incorporated - at least enough to be
443 * found. So the ->parent link holds our place in the tree
445 unmap_iblock(new, ibuf);
447 LAFS_BUG(LAFSI(new->b.inode)->iblock != ib, &ib->b);
448 LAFSI(new->b.inode)->iblock = new;
449 list_add(&ib->b.siblings, &new->children);
450 INIT_LIST_HEAD(&new->b.siblings);
452 LAFS_BUG(new->depth != LAFSI(new->b.inode)->depth, &ib->b);
453 lafs_clear_index(new);
455 /* Make sure pin count is correct */
456 LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
457 set_bit(B_Pinned, &new->b.flags);
458 set_phase(&new->b, test_bit(B_Phase1, &ib->b.flags));
459 atomic_set(&new->pincnt[!!test_bit(B_Phase1, &ib->b.flags)], 1);
/* NOTE(review): elided extract -- debug dump of an index buffer in whichever
 * of the three layouts (indirect / extent / index) its type field says. */
462 static void print_index(char *buf, u32 start, int len)
464 int type = le16_to_cpu(*(u16 *)(buf));
473 printk(" Indirect:\n");
477 printk(" %u:%lu", start, (unsigned long)phys);
484 printk(" Extent:\n");
490 printk(" %u-%u: %llu\n", addr, addr+size-1,
491 (unsigned long long) phys);
501 printk(" %u:%lu ", addr, (unsigned long)phys);
/* NOTE(review): elided extract -- debug dump of a pending-address table:
 * one line per entry giving the file-address range and physical address. */
509 void lafs_print_uninc(struct uninc *ui)
512 printk(" pending=%d\n", ui->pending_cnt);
513 for (i = 0; i < ui->pending_cnt; i++)
514 printk(" [%d] %u-%u : %lu\n", i,
515 ui->pending_addr[i].fileaddr,
516 ui->pending_addr[i].fileaddr
517 +ui->pending_addr[i].cnt - 1,
518 (unsigned long)ui->pending_addr[i].physaddr);
524 u32 nextaddr; /* on failure, set to the first address that doesn't fit */
/* NOTE(review): heavily elided -- layout callback that writes one
 * index-format entry into the layoutinfo buffer. */
531 static int add_index(void *data, u32 addr, u64 phys)
533 struct layoutinfo *li = data;
538 encode16(cp, IBLK_INDEX);
552 /* Each entry is 10 bytes: 6byte dev, 4byte file */
/* NOTE(review): elided extract -- layout callback writing 6-byte indirect
 * entries; phys == ~0 with addr == 0 is the finalise call (per walkers). */
561 static int add_indirect(void *data, u32 addr, u64 phys, int len)
563 struct layoutinfo *li = data;
572 encode16(p, IBLK_INDIRECT);
578 return 0; /* finalise */
580 p = li->data + (addr - li->lastaddr) * 6;
581 lastaddr = li->lastaddr + (li->size/6);
582 for (i = 0; i < len && addr < lastaddr ; i++) {
/* NOTE(review): elided extract -- layout callback that accumulates
 * contiguous addresses into extents, closing an extent when it is
 * non-contiguous or would exceed the 16-bit length field. */
592 static int add_extent(void *data, u32 addr, u64 phys, int len)
594 struct layoutinfo *li = data;
600 encode16(p, IBLK_EXTENT);
608 /* close off an extent if there is one */
609 dprintk("Finalise: %d\n", (int)li->esize);
612 encode16(p, li->esize);
618 if (li->esize && li->lastaddr == addr &&
619 li->lastphys == phys) {
620 /* just extend the extent */
621 dprintk("Extending %llu %u %lu\n",
622 (unsigned long long)phys,
624 (unsigned long)addr);
631 /* need to close old extent */
633 encode16(p, li->esize);
/* Out of room: record where the caller must resume. */
638 dprintk("Extent says 'no room'\n");
639 li->nextaddr = li->lastaddr;
646 dprintk("Added %llu %u %lu\n",
647 (unsigned long long)phys,
649 (unsigned long)addr);
653 li->lastaddr = addr + len;
654 li->lastphys = phys + len;
665 int nofit; /* bit mask of which layouts don't fit */
/* NOTE(review): elided extract -- dry-run walker callback that decides
 * whether indirect or extent layout fits best, setting ->choice and the
 * ->nofit bitmask. */
670 static int check_leaf(void *data, u32 addr, u64 phys, int len)
672 struct leafinfo *li = data;
675 /* We are gathering data to see whether an indirect block or
676 * an extent block is best - depending on which fills the block
677 * first (that one is not good choice).
678 * We also see if either will fit at all.
679 * For indirect, we give up as soon as an address will not fit in
681 * For extent, we track how many extents will be needed, and give up
682 * when that exceeds the size of the block.
686 /* We are initialising the structure */
687 memset(li, 0, sizeof(*li));
688 li->firstaddr = addr;
689 li->blksize = len - 2; /* exclude type field */
693 return 0; /* nothing to finalise */
696 /* Check for indirect layout */
697 if (((addr + len) - li->firstaddr) * 6 > li->blksize) {
698 /* Indirect no longer fits */
700 li->choice = IBLK_EXTENT;
701 li->nofit |= (1 << IBLK_INDIRECT);
703 /* Check for extent layout */
704 if (addr == li->nextaddr && phys == li->nextphys) {
705 /* This fits in the current extent, at least partly */
/* Extent count field is 16 bits, so cap each extent below 0x10000. */
707 if (li->esize + size >= 0x10000) {
708 /* extent too large */
709 len = 0x10000 - li->esize - 1;
712 li->nextaddr += size;
713 li->nextphys += size;
719 /* need a new extent */
721 if (li->extents * 12 > li->blksize) {
722 /* But it won't fit */
724 li->choice = IBLK_INDIRECT;
725 li->nofit |= (1 << IBLK_EXTENT);
727 li->nextaddr = addr+1;
728 li->nextphys = phys+1;
/* Stop early once neither layout can fit. */
732 (li->nofit & (1 << IBLK_INDIRECT)) &&
733 (li->nofit & (1 << IBLK_EXTENT)))
/* NOTE(review): elided extract -- merges the sorted index buffer with the
 * sorted 'uninc' chain, feeding each surviving address to 'handle'. */
739 static u32 walk_index(u32 addr, char **bufp, int len, struct block *uninc,
740 int (*handle)(void*, u32, u64),
743 /* Pass each entry in buf or uninc to "handle" (which is
745 * Must pass addresses in order.
746 * 'handle' returns 1 if the address was added.
747 * We return 1 if all addresses handled, else 0.
749 * Entries are 10 bytes: 6 byte dev address, 4 byte file address.
754 handle(data, addr, 0); /* initialise */
756 while ((len >= 10 && !found_end) || uninc != NULL) {
/* NOTE(review): this local 'addr' shadows the parameter -- appears
 * intentional (parameter only used for initialise), but confirm. */
757 unsigned long addr = 0;
761 phys = decode48(buf);
762 addr = decode32(buf);
769 (phys == 0 || uninc->fileaddr <= addr)) {
770 /* New block at or before current address */
771 if (uninc->physaddr &&
772 !handle(data, uninc->fileaddr, uninc->physaddr)) {
773 if (addr > uninc->fileaddr)
778 if (uninc->fileaddr == addr)
779 /* Don't record the old value from the index */
781 uninc = uninc->chain;
784 if (phys && !handle(data, addr, phys)) {
789 handle(data, 0, ~(u64)0); /* finalise */
/* NOTE(review): elided extract -- walks a 6-byte-entry indirect buffer
 * merged with pending ranges from 'ui', one address at a time. */
793 static u32 walk_indirect(u32 addr, char **bufp, int len, struct uninc *ui,
794 int (*handle)(void*, u32, u64, int),
797 /* Pass each entry in buf or ui to "handle".
798 * Must pass addresses in order.
799 * 'handle' returns number of addresses handled.
800 * If this isn't all, we stop and return 0
801 * else 1 if all are handled.
807 handle(data, addr, 0, len); /* initialise */
809 while (len >= 6 || uinum < ui->pending_cnt) {
813 phys = decode48(buf);
815 } else if (uinum < ui->pending_cnt &&
816 addr < ui->pending_addr[uinum].fileaddr + uioffset)
817 /* Skip ahead quickly. */
818 addr = ui->pending_addr[uinum].fileaddr + uioffset;
/* A pending entry at this address overrides the buffer's value. */
820 if (uinum < ui->pending_cnt &&
821 ui->pending_addr[uinum].fileaddr + uioffset == addr) {
822 phys = ui->pending_addr[uinum].physaddr + uioffset;
824 if (uioffset >= ui->pending_addr[uinum].cnt) {
828 /* FIXME what if pending address ranges overlap !! */
829 /* FIXME should an pending_addr range of holes be allowed?*/
830 /* FIXME should we check that addr is never > fileaddr+offset ? */
833 if (!handle(data, addr, phys, 1)) {
841 handle(data, 0, ~(u64)0, 0); /* finalise */
/* NOTE(review): elided extract -- walks an extent buffer merged with the
 * pending ranges in 'ui'; pending extents may overlap stored extents, so
 * extents are split at overlap boundaries before being handed to 'handle'. */
845 static u32 walk_extent(u32 addr, char **bufp, int len, struct uninc *ui,
846 int (*handle)(void*, u32, u64, int),
849 /* pass each extent in buf or ui to handle. Extents
850 * can overlap, so care is needed
861 handle(data, addr, 0, len); /* initialise */
863 while (uinum < ui->pending_cnt || ! found_end) {
864 if (elen == 0 && !found_end) {
866 ephys = decode48(buf);
867 elen = decode16(buf);
868 eaddr = decode32(buf);
870 BUG_ON(ephys == 0 && elen != 0); // FIXME fail gracefully
871 dprintk("Found %llu %u %lu at %d\n",
872 (unsigned long long) ephys,
874 (unsigned long)eaddr, len);
/* 0xFFFFFFFF sentinel: no more extents in the buffer. */
876 eaddr = 0xFFFFFFFFUL;
880 eaddr = 0xFFFFFFFFUL;
886 /* Handle all pending extents that start before (or at) here */
887 while (uinum < ui->pending_cnt &&
888 ui->pending_addr[uinum].fileaddr + uioffset <= eaddr &&
889 (elen > 0 || eaddr == 0xFFFFFFFFUL)
891 /* just submit next extent from ui */
893 int hlen = ui->pending_addr[uinum].cnt - uioffset;
894 /* If there is an overlap with the current extent, only
895 * add the overlapped portion.
897 dprintk("(%llu %lu %u) (%llu %lu %u) + %d = %d (%d)\n",
898 ephys, (unsigned long)eaddr, elen,
899 ui->pending_addr[uinum].physaddr,
900 (unsigned long)ui->pending_addr[uinum].fileaddr,
901 ui->pending_addr[uinum].cnt,
902 uioffset, hlen, uinum);
904 ui->pending_addr[uinum].fileaddr + uioffset + hlen
906 hlen = (eaddr + elen) -
907 (ui->pending_addr[uinum].fileaddr + uioffset);
/* physaddr == 0 means a hole: nothing to submit. */
908 if (ui->pending_addr[uinum].physaddr)
909 handled = handle(data,
910 ui->pending_addr[uinum].fileaddr + uioffset,
911 ui->pending_addr[uinum].physaddr + uioffset,
915 /* That might replace some of current extent */
916 overlap = (ui->pending_addr[uinum].fileaddr + uioffset +
918 if (elen && overlap > 0) {
925 if (handled < hlen) {
933 if (uioffset >= ui->pending_addr[uinum].cnt) {
938 BUG_ON(elen && uioffset);
939 /* If uninc extent intersects this extent, just submit
940 * partial extent and loop
942 if (elen && uinum < ui->pending_cnt &&
943 ui->pending_addr[uinum].fileaddr < eaddr + elen) {
944 int elen2 = ui->pending_addr[uinum].fileaddr - eaddr;
945 handled = handle(data, eaddr, ephys, elen2);
946 if (handled < elen2) {
954 /* Ok, just submit this extent */
955 handled = handle(data, eaddr, ephys, elen);
956 if (handled < elen) {
965 handle(data, 0, ~(u64)0, 0); /* finalise */
/* NOTE(review): elided extract -- maps the leaf's buffer, reads its layout
 * type, and dispatches to walk_indirect or walk_extent with 'handle'. */
969 void lafs_walk_leaf_index(struct indexblock *ib,
970 int (*handle)(void*, u32, u64, int),
977 struct fs *fs = fs_from_inode(ib->b.inode);
981 /* Cannot use kmap_atomic as handle might sleep when
982 * looking up segusage blocks
/* InoIdx: index data lives inside the inode's data block. */
984 if (test_bit(B_InoIdx, &ib->b.flags)) {
985 ibuf = kmap(LAFSI(ib->b.inode)->dblock->page);
986 ibuf += dblock_offset(LAFSI(ib->b.inode)->dblock);
988 ibuf = map_iblock(ib);
989 if (test_bit(B_InoIdx, &ib->b.flags))
990 offset = LAFSI(ib->b.inode)->metadata_size;
993 len = fs->blocksize - offset;
995 current_layout = le16_to_cpu(*(u16 *)(ibuf+offset));
996 buf = ibuf + offset + 2;
997 dprintk("CURRENT=%d\n", current_layout);
998 switch (current_layout) {
1000 walk_indirect(ib->b.fileaddr,
1005 walk_extent(ib->b.fileaddr,
1010 printk("CURRENT=%d\n", current_layout);
1011 LAFS_BUG(1, &ib->b); // FIXME should be IO error ??
1013 if (test_bit(B_InoIdx, &ib->b.flags))
1014 kunmap(LAFSI(ib->b.inode)->dblock->page);
/* NOTE(review): heavily elided -- splits the chain at *ibp, moving entries
 * with fileaddr >= 'next' onto *newp (per its callers in the split paths). */
1017 static void share_list(struct block **ibp, struct block **newp, u32 next)
1019 struct block *uin = *ibp;
1022 struct block *b = uin;
1025 if (b->fileaddr < next) {
/* NOTE(review): elided extract -- moves/splits pending address ranges at
 * the 'next' boundary from one uninc table to another, then divides the
 * credits (the author's own FIXME notes the division is unsatisfying). */
1037 static void share_uninc(struct uninc *from, struct uninc *to,
1040 /* Any addresses in 'from' that are at-or-after 'next' go to 'to'.
1041 * Credits need to be shared somehow too...
1044 struct addr *ia, *ja;
1048 for (i = 0; i < from->pending_cnt; i++) {
1049 ia = &from->pending_addr[i];
1052 /* This one stays put */
1054 if (ia->fileaddr < next) {
1055 /* This one gets split */
1056 j = to->pending_cnt;
1057 ja = &to->pending_addr[j];
1058 len = next - ia->fileaddr;
1060 ja->fileaddr = next;
1061 ja->cnt = ia->cnt - len;
1062 ja->physaddr = ia->physaddr + len;
1063 moved += ia->cnt - len;
1066 to->pending_cnt = j+1;
1069 /* This one gets moved across. */
1070 j = to->pending_cnt;
1071 ja = &to->pending_addr[j];
1073 to->pending_cnt = j+1;
1076 /* Move the last one into this spot, and try again */
1077 j = from->pending_cnt-1;
1078 ja = &from->pending_addr[j];
1080 from->pending_cnt = j;
1083 /* FIXME this doesn't make sense.
1084 * if the test can fail, we must have a better answer
1086 if (from->credits >= moved) {
1087 to->credits += moved;
1088 from->credits -= moved;
1090 int c = from->credits / 2;
/* NOTE(review): elided extract -- after a horizontal split, reparents the
 * children at-or-after new->b.fileaddr from ib to new, keeping pin counts
 * and reference counts consistent. */
1097 static void share_children(struct indexblock *ib, struct indexblock *new)
1099 /* Some of the children for *ib must now become children
1100 * of *new. When moving children we must be careful that
1101 * blocks with B_PrimaryRef stay with their primary,
1102 * or drop the flag and the reference
1104 struct block *b, *tmp;
1106 list_for_each_entry_safe(b, tmp, &ib->children, siblings) {
1107 if (b->fileaddr == new->b.fileaddr &&
1108 test_bit(B_Index, &b->flags) &&
1109 test_and_clear_bit(B_PrimaryRef, &b->flags)) {
1110 /* This block had a 'Primary' reference on
1111 * the previous block. We must drop that
1112 * before it can be moved.
1114 putref(list_entry(b->siblings.prev,
1115 struct block, siblings),
1118 if (b->fileaddr >= new->b.fileaddr) {
1119 list_move_tail(&b->siblings, &new->children);
1121 getiref(new, MKREF(child));
/* Pinned children contribute to the per-phase pin counts. */
1122 if (test_bit(B_Pinned, &b->flags)) {
1123 int ph = test_bit(B_Phase1, &b->flags);
1124 atomic_inc(&new->pincnt[ph]);
1125 atomic_dec(&ib->pincnt[ph]);
1127 putiref(ib, MKREF(child));
/* NOTE(review): elided extract -- the main leaf-incorporation routine:
 * simulate both layouts with check_leaf, lay out into 'new', then either
 * copy back, grow the tree, or perform a horizontal split. */
1132 static int do_incorporate_leaf(struct fs *fs, struct indexblock *ib,
1134 struct indexblock *new)
1136 /* Incorporate all the changes in ib->uninc_table into ib,
1137 * with any new address appearing in new.
1138 * Return value is one of:
1139 * 0: uninc_table removed everything, block is now empty.
1140 * 1: everything fits in 'ib'. New can be discarded. uninc_table
1142 * 2: the addresses are in ib, new, and possibly new->uninc_table.
1143 * 'new' has ->fileaddr set.
1144 * 3: ib is InoIdx and the addresses didn't all fit, so everything
1145 * was copied to new and ib is now an IBLK_INDEX block pointing
1146 * at new. 'new' still needs incorporation.
1148 * We first simulate laying out the info in both
1149 * Indirect and Extent layouts and choose the best.
1150 * Then we perform the layout into new. Then we swap data
1151 * between ib and new as appropriate and return the result.
1154 char *buf, *b2, *sbuf;
1157 struct leafinfo leafinfo;
1160 struct layoutinfo layout;
1169 ibuf = map_iblock(ib);
1170 if (test_bit(B_InoIdx, &ib->b.flags))
1171 offset = LAFSI(ib->b.inode)->metadata_size;
1174 len = fs->blocksize - offset;
/* Dry-run pass: decide which layout fits and whether either fits at all. */
1176 check_leaf(&leafinfo, ib->b.fileaddr, 0, len);
1178 current_layout = le16_to_cpu(*(u16 *)(ibuf+offset));
1179 buf = ibuf + offset + 2;
1180 switch (current_layout) {
1182 walk_indirect(ib->b.fileaddr,
1184 check_leaf, &leafinfo);
1187 walk_extent(ib->b.fileaddr,
1189 check_leaf, &leafinfo);
1192 printk("Current is %d\n", current_layout);
1193 LAFS_BUG(1, &ib->b); // FIXME should be IO error ??
1196 if (leafinfo.cnt == 0) {
1197 /* no addresses need to be stored here. */
1198 unmap_iblock(ib, ibuf);
1201 choice = leafinfo.choice ?: IBLK_EXTENT; /* default to Extents */
1202 if (offset && (leafinfo.nofit & (1<<choice))) {
1203 /* addresses won't fit in this inode, so copy into new,
1204 * and set inode up as an index block
1206 unmap_iblock(ib, ibuf);
1207 grow_index_tree(ib, new);
/* Real pass: lay out into 'new' using the chosen format. */
1211 nbuf = map_iblock_2(new);
1214 memset(nbuf, 0, fs->blocksize);
1215 set_bit(B_Valid, &new->b.flags);
1217 buf = ibuf + offset + 2;
1218 switch (current_layout) {
1220 ok = walk_indirect(ib->b.fileaddr,
1222 (choice == IBLK_EXTENT)
1223 ? add_extent : add_indirect,
1227 ok = walk_extent(ib->b.fileaddr,
1229 (choice == IBLK_EXTENT)
1230 ? add_extent : add_indirect,
1235 dprintk("walk_leaf only got as far as %d\n", (int)layout.nextaddr);
1236 // print_index(ibuf+offset, ib->b.fileaddr, len);
1238 /* it all fit perfectly.
1239 * Copy from 'new' into 'ib' and zero the uninc list
1241 //printk("Offset=%d len=%d\n", offset, len);
1242 memcpy(ibuf+offset, nbuf, len);
1243 unmap_iblock_2(new, nbuf);
1244 unmap_iblock(ib, ibuf);
1247 next = layout.nextaddr;
1248 LAFS_BUG(offset, &ib->b);
1250 /* It didn't fit, and this is a regular index block, so
1251 * we need to do a horizontal split.
1252 * 'new' already has the early addresses.
1253 * All addresses before 'next' in 'ui' and 'ib' need to be
1255 * new->fileaddr must be set to 'next'.
1256 * data buffers for new and ib need to be swapped
1259 /* There is nowhere that we can safely put any index info
1260 * that is still in 'ui' except into the new block with the
1261 * remains of the 'ib' addresses.
1262 * It had better fit. And it will.
1263 * Here are the cases for current_layout and choice.
1264 * INDIRECT -> INDIRECT
1265 * The start of ib hasn't changed, so all the addresses that
1266 * were there are now in new. So ib is empty and can fit everything
1267 * in ui, as ui is <<< ib
1269 * INDIRECT -> EXTENT
1270 * More fitted as EXTENTS, so we must have included all remaining
1271 * addresses from ib, so only some ui addresses remain.
1273 * If extents have been added into new, there can have been at most
1274 * two for each extent in ui (the new extent, and the extra piece from
1275 * a split extent). So worst case there are 16 extents left to encode,
1276 * twice the number in ui.
1277 * EXTENT -> INDIRECT
1278 * The INDIRECT must be storing more addresses than the EXTENT option,
1279 * so there is even less remainder to be encoded.
1281 * In each case, the remainder of ib must use less than half the space,
1282 * and it must be in EXTENT format if it is not empty.
1283 * So we can move them up to the top of the buffer, then merge with
1284 * anything remaining in ui.
1287 /* Shuffle remaining extents in ui down so they are easy to access */
1289 for (uinum = 0; uinum < ui->pending_cnt; uinum++) {
1290 if (ui->pending_addr[uinum].cnt == 0)
1292 if (ui->pending_addr[uinum].fileaddr
1293 + ui->pending_addr[uinum].cnt < next)
1295 if (ui->pending_addr[uinum].fileaddr < next) {
/* Range straddles 'next': keep only the tail that wasn't placed. */
1296 int cnt = ui->pending_addr[uinum].cnt
1297 - (next - ui->pending_addr[uinum].fileaddr);
1298 ui->pending_addr[uinxt].physaddr =
1299 ui->pending_addr[uinum].physaddr
1300 + (next - ui->pending_addr[uinum].fileaddr);
1301 ui->pending_addr[uinxt].fileaddr = next;
1302 ui->pending_addr[uinxt].cnt = cnt;
1304 ui->pending_addr[uinxt].fileaddr =
1305 ui->pending_addr[uinum].fileaddr;
1306 ui->pending_addr[uinxt].physaddr =
1307 ui->pending_addr[uinum].physaddr;
1308 ui->pending_addr[uinxt].cnt =
1309 ui->pending_addr[uinum].cnt;
1313 ui->pending_cnt = uinxt;
1315 switch (current_layout) {
1317 /* buf must now be effectively empty, and there must
1318 * be at least one extent still in ui.
1319 * So zero the buffer and encode those extents as IBLK_EXTENT
1321 LAFS_BUG(next < ib->b.fileaddr + (len-2)/6, &ib->b);
1322 LAFS_BUG(ui->pending_cnt == 0, &ib->b);
1323 memset(ibuf, 0, len);
1328 /* The combination of the remaining extents in buf, and
1329 * those in ui will not fill buf.
1330 * So shift the buf extents up to the top of the buffer,
1331 * then merge the remaining ui extents in.
1332 * The first extent may be partially processed already, so
1333 * we might need to adjust it.
1335 /* Find end of index list. Could use a binary search,
1339 while (b2 + 12 <= ibuf + len) {
1340 ephys = decode48(b2);
1341 elen = decode16(b2);
1342 eaddr = decode32(b2);
1348 int handled = next - eaddr;
1349 BUG_ON(eaddr + elen <= next);
1352 encode48(b2, ephys + handled);
1353 encode16(b2, elen - handled);
1357 /* Data we want extends from buf up to b2
1358 * Move it to end of ibuf
1361 sbuf = ibuf + len - slen;
1362 memmove(sbuf, buf, slen);
1365 LAFS_BUG(1, &ib->b);
1369 ok = walk_extent(next, &sbuf, slen, ui, add_extent, &layout);
1370 LAFS_BUG(!ok, &ib->b);
1371 if (slen && layout.data > sbuf) {
1372 printk("slen=%d ld-sb=%d layout.data=%p sbuf=%p "
1373 "buf=%p ibuf=%p len=%d\n",
1374 slen, (int)(layout.data-sbuf), layout.data, sbuf,
1377 LAFS_BUG(slen && layout.data > sbuf, &ib->b);
1378 memset(layout.data, 0, layout.size);
1380 new->b.fileaddr = next;
1383 unmap_iblock_2(new, nbuf);
1384 unmap_iblock(ib, ibuf);
1386 /* new needs to be a sibling of ib, and children need to be shared out,
1387 * distributing pincnt appropriately.
1388 * uninc_next need to be shared too
1390 share_children(ib, new);
1392 spin_lock(&ib->b.inode->i_data.private_lock);
1394 share_list(&ib->uninc_next, &new->uninc_next, next);
1395 new->uninc_table.pending_cnt = 0;
1396 new->uninc_table.credits = 0;
1397 share_uninc(&ib->uninc_table, &new->uninc_table, next);
1398 spin_unlock(&ib->b.inode->i_data.private_lock);
1400 new->depth = ib->depth;
1401 if (ib->b.siblings.next != &ib->b.parent->children &&
1402 test_bit(B_PrimaryRef, &list_entry(ib->b.siblings.next,
1403 struct block, siblings)->flags))
1404 /* Inserting into a PrimaryRef chain */
1405 getiref(new, MKREF(primary));
1407 getiref(ib, MKREF(primary));
1408 set_bit(B_PrimaryRef, &new->b.flags);
1409 list_add(&new->b.siblings, &ib->b.siblings);
1410 new->b.parent = ib->b.parent;
1411 (void)getiref(new->b.parent, MKREF(child));
1412 new->b.inode = ib->b.inode;
1413 LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
1414 set_phase(&new->b, test_bit(B_Phase1, &ib->b.flags));
/* NOTE(review): elided extract -- internal-node analogue of
 * do_incorporate_leaf, merging the 'uninc' child chain into an
 * IBLK_INDEX block, splitting horizontally if it does not fit. */
1418 static int do_incorporate_internal(struct fs *fs, struct indexblock *ib,
1419 struct block *uninc, int *credits,
1420 struct indexblock *new)
1422 /* Incorporate all the changes in 'uninc' into ib,
1423 * with any new address appearing in new.
1424 * Return value is one of:
1425 * 0: uninc removed everything, block is now empty.
1426 * 1: everything fits in 'ib'. New can be discarded. uninc_table
1428 * 2: the addresses are in ib, new, and possibly new->uninc
1429 * 'new' has ->fileaddr set.
1430 * 3: ib is InoIdx and the addresses didn't all fit, so everything
1431 * was copied to new and ib is now an IBLK_INDEX block pointing
1432 * at new. 'new' still needs incorporation.
1440 struct layoutinfo layout;
1445 ibuf = map_iblock(ib);
1446 if (test_bit(B_InoIdx, &ib->b.flags))
1447 offset = LAFSI(ib->b.inode)->metadata_size;
1450 len = fs->blocksize - offset;
1452 current_layout = le16_to_cpu(*(u16 *)(ibuf+offset));
1453 buf = ibuf + offset + 2;
/* Internal nodes must already be in index format. */
1454 LAFS_BUG(current_layout != IBLK_INDEX,
1457 nbuf = map_iblock_2(new);
1460 memset(nbuf, 0, fs->blocksize);
1462 ok = walk_index(ib->b.fileaddr,
1467 dprintk("walk_index only got as far as %d\n", (int)next);
1470 /* it all fit perfectly.
1471 * Copy from 'new' into 'ib' and free all uninc blocks
1473 //printk("Offset=%d len=%d\n", offset, len);
1474 memcpy(ibuf+offset, nbuf, len);
1475 unmap_iblock_2(new, nbuf);
1476 unmap_iblock(ib, ibuf);
1480 struct block *b = uninc;
1481 uninc = uninc->chain;
1482 cr += incorp_index(b);
1489 unmap_iblock(ib, ibuf);
1490 grow_index_tree(ib, new);
1493 next = layout.nextaddr;
1494 /* It didn't fit, and this is a regular index block, so
1495 * we need to do a horizontal split.
1496 * 'new' already has the early addresses.
1497 * All addresses before 'next' in 'ib' need to be
1499 * new->fileaddr must be set to 'next'.
1500 * data buffers for new and ib need to be swapped
1503 /* shuffle ibuf down.
1504 * 'buf' is the starting point.
/* NOTE(review): source and destination overlap within ibuf (buf >= ibuf+2);
 * memmove would be the safe choice here -- confirm against the full file. */
1506 memcpy(ibuf+2, buf, ibuf + len - buf);
1507 memset(ibuf+2 + len - (buf - ibuf), 0, buf - (ibuf+2));
1509 new->b.fileaddr = next;
1512 unmap_iblock_2(new, nbuf);
1513 unmap_iblock(ib, ibuf);
1514 set_bit(B_Valid, &new->b.flags);
1516 /* new needs to be a sibling of ib, and children need to be shared out,
1517 * distributing pincnt appropriately.
1518 * All of uninc that is remaining goes to 'new', while
1519 * ib->uninc and ib->uninc_next need to be shared out.
1520 * We need to be careful to preserve the relative order of children
1521 * as they might not all be incorporated yet.
1523 share_children(ib, new);
/* Children placed before 'next' were fully incorporated above. */
1527 while (uninc && uninc->fileaddr < next) {
1528 struct block *b = uninc;
1529 uninc = uninc->chain;
1530 cr += incorp_index(b);
1535 spin_lock(&ib->b.inode->i_data.private_lock);
1537 share_list(&ib->uninc_next, &new->uninc_next, next);
1539 share_list(&ib->uninc, &new->uninc, next);
1540 spin_unlock(&ib->b.inode->i_data.private_lock);
1542 new->depth = ib->depth;
1543 if (ib->b.siblings.next != &ib->b.parent->children &&
1544 test_bit(B_PrimaryRef, &list_entry(ib->b.siblings.next,
1545 struct block, siblings)->flags))
1546 /* Inserting into a PrimaryRef chain */
1547 getiref(new, MKREF(primary));
1549 getiref(ib, MKREF(primary));
1550 set_bit(B_PrimaryRef, &new->b.flags);
1551 list_add(&new->b.siblings, &ib->b.siblings);
1552 new->b.parent = ib->b.parent;
1553 (void)getiref(new->b.parent, MKREF(child));
1554 new->b.inode = ib->b.inode;
1555 LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
1556 set_phase(&new->b, test_bit(B_Phase1, &ib->b.flags));
1558 if (new->uninc->fileaddr == next) {
1560 /* This block really needs to be incorporated, or
1561 * we might not be able to find it - it might be
1562 * a primary for other blocks too.
1563 * So do a minimal incorporation of this block.
1564 * That must succeed as it will be an addition to an
1565 * empty block, or a replacement.
1568 new->uninc = b->chain;
1570 nbuf = map_iblock(new);
1571 cr = incorporate_index(b, nbuf, len);
1572 unmap_iblock(new, nbuf);
1573 LAFS_BUG(cr < 0, &ib->b);
/* NOTE(review): elided extract -- removes redundant deletion (phys == 0)
 * entries from the chain when another entry has the same file address. */
1580 static void filter_empties(struct block **uninc)
1582 /* If any block has phys==0 and there is another block
1583 * with same address, incorp_index the phys==0 block
1585 struct block *b = *uninc;
1586 while (b && b->chain) {
1587 if (b->fileaddr == b->chain->fileaddr) {
1589 if (b->physaddr == 0) {
1592 } else if (b->chain->physaddr == 0) {
1594 b->chain = t->chain;
1606 /* Incorporate all the addresses in uninc_table into this
1607 * indexblock. Each address carried a credit to allow for
1608 * indexblock splitting etc.
1609 * The block is already Dirty if it should go in the main
1610 * cluster, or Realloc if it should go in a cleaning cluster.
1612 void lafs_incorporate(struct fs *fs, struct indexblock *ib)
/*
 * NOTE(review): this extract is missing many interior source lines
 * (the embedded original numbering jumps), so some control structure
 * — braces, else-branches, the switch(rv) head, goto/break targets —
 * is not visible here. Comments below describe only what the visible
 * lines establish.
 */
1614 int offset; /* start of block where indexing is */
1615 struct indexblock *new = NULL;
1616 struct block *uninc = NULL;
1620 int blocksize = fs->blocksize;
/* Entry invariants: caller holds the block's IO lock; the block is
 * already scheduled for writeout (Dirty or Realloc); and it must be
 * Valid whenever it actually indexes something (depth > 0).
 */
1622 LAFS_BUG(!test_bit(B_IOLock, &ib->b.flags), &ib->b);
1624 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1625 !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1627 LAFS_BUG(!test_bit(B_Valid, &ib->b.flags) && ib->depth > 0, &ib->b);
/* Leaf blocks (depth <= 1) carry pending addresses in ib->uninc_table
 * (and must have an empty ib->uninc child list — asserted below);
 * internal blocks instead carry unincorporated child blocks on
 * ib->uninc / ib->uninc_next.
 */
1629 if (ib->depth <= 1) {
1630 /* take a copy of the uninc_table so we can work while
1631 * more changes can be made
1638 struct block *x = ib->uninc;
1640 printk("x=%s\n", strblk(x));
1644 LAFS_BUG(ib->uninc, &ib->b);
/* Snapshot and reset the table under private_lock so that new
 * addresses may keep arriving while we work on the copy.
 */
1646 spin_lock(&ib->b.inode->i_data.private_lock);
1647 memcpy(&uit, &ib->uninc_table, sizeof(uit));
1648 ib->uninc_table.pending_cnt = 0;
1649 ib->uninc_table.credits = 0;
1650 spin_unlock(&ib->b.inode->i_data.private_lock);
/* For an inode's own index block the index area begins after the
 * inode metadata, not at offset 0.
 */
1652 if (test_bit(B_InoIdx, &ib->b.flags)) {
1653 offset = LAFSI(ib->b.inode)->metadata_size;
1654 if (LAFSI(ib->b.inode)->depth == 0) {
1655 /* data has already been copied
1656 * out.. I hope - FIXME */
1657 LAFSI(ib->b.inode)->depth = 1;
1659 lafs_clear_index(ib);
1664 buf = map_iblock(ib) + offset;
1667 start = decode32(b);
/* Nothing pending: just unmap and bail out (return path is on a
 * line missing from this extract).
 */
1669 if (uit.pending_cnt == 0) {
1670 unmap_iblock(ib, buf-offset);
1673 /* Check it really is sorted */
1676 for (i=1; i<uit.pending_cnt; i++)
1677 if (uit.pending_addr[i].fileaddr <
1678 uit.pending_addr[i-1].fileaddr +
1679 uit.pending_addr[i-1].cnt)
1683 if(!test_bit(B_Dirty, &ib->b.flags) &&
1684 !test_bit(B_Realloc, &ib->b.flags))
1685 printk("bad %s\n", strblk(&ib->b));
1686 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1687 !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1689 /* OK, we genuinely have work to do. */
1690 /* find size and type of current block */
1692 /* Maybe a direct update of an indirect block */
1693 if (type == IBLK_INDIRECT &&
1694 incorporate_indirect(&uit, buf+2,
1696 blocksize-(offset+2))) {
1697 unmap_iblock(ib, buf-offset);
1701 /* Maybe a fairly direct update of an extent block */
1702 if (type == IBLK_EXTENT &&
1703 incorporate_extent(&uit, buf+2,
1704 blocksize-(offset+2))) {
1705 unmap_iblock(ib, buf-offset);
/* Neither in-place fast path applied: dump debug state and fall
 * through to the rebuild-into-a-new-block slow path below.
 */
1709 dprintk("Index contains:\n");
1711 print_index(buf, ib->b.fileaddr, blocksize - offset);
1712 dprintk("uninc contains:\n");
1714 lafs_print_uninc(&uit);
1716 unmap_iblock(ib, buf-offset);
/* Internal block (depth > 1): the address table must be empty; take
 * the uninc child list under private_lock (the list transfer itself
 * is on lines missing from this extract), then sort it and weed out
 * superseded empty entries.
 */
1722 LAFS_BUG(ib->uninc_table.pending_cnt, &ib->b);
1723 spin_lock(&ib->b.inode->i_data.private_lock);
1726 spin_unlock(&ib->b.inode->i_data.private_lock);
1731 sort_blocks(&uninc);
1732 filter_empties(&uninc);
1734 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1735 !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1737 /* OK, we genuinely have work to do. */
1738 /* find size and type of current block */
1739 buf = map_iblock(ib);
1741 if (test_bit(B_InoIdx, &ib->b.flags)) {
1742 offset = LAFSI(ib->b.inode)->metadata_size;
1747 dprintk("Index contains:\n");
1751 print_index(buf, ib->b.fileaddr, blocksize - offset);
1752 printk("uninc list:\n");
1753 for (b = uninc; b ; b=b->chain)
1754 printk(" %lu: %llu\n",
1755 (unsigned long) b->fileaddr,
1756 (unsigned long long) b->physaddr);
1759 /* internal index block. Might be able to merge in-place */
1760 if (*(u16 *)(buf) == cpu_to_le16(IBLK_INDEX) &&
1761 (cred = incorporate_index(uninc, buf+2,
1762 blocksize-(offset+2))) >= 0) {
1763 unmap_iblock(ib, buf-offset);
1767 unmap_iblock(ib, buf-offset);
1770 /* Ok, those were the easy options. Now we need to allocate a
/* Slow path: build the result into a freshly allocated index block. */
1774 new = lafs_iblock_alloc(fs, GFP_NOFS, 1, MKREF(inc));
1775 /* FIXME need to preallocate something for a fall-back?? */
1777 /* We lock the block despite that fact that it isn't
1778 * shared, so that map_iblock won't complain */
1779 lafs_iolock_block(&new->b);
1782 printk("small depth %s\n", strblk(&ib->b));
1783 LAFS_BUG(ib->depth < 1, &ib->b);
/* Leaf and internal blocks are rebuilt by different helpers; 'rv'
 * selects the completion case in the switch below (case labels 1-3
 * are visible; the switch head itself is on a missing line).
 */
1785 rv = do_incorporate_leaf(fs, ib, &uit, new);
1787 rv = do_incorporate_internal(fs, ib, uninc, &uit.credits, new);
/* Case 0 (presumably): everything vanished — 'new' is unused. */
1791 /* There is nothing in this block any more.
1792 * If it is an inode, clear it, else punch a hole
1794 dprintk("incorp to empty off=%d %s\n",
1795 (int)offset, strblk(&ib->b));
1796 lafs_iounlock_block(&new->b);
1797 lafs_iblock_free(new);
1798 lafs_clear_index(ib);
1801 case 1: /* everything was incorporated - hurray */
1802 lafs_iounlock_block(&new->b);
1803 lafs_iblock_free(new);
1804 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1805 !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1806 /* Don't need to dirty, it is already dirty */
1809 case 2: /* Simple split */
1810 /* The addresses are now in 'ib', 'new' and possibly
1811 * new->uninc_table. 'new' has been linked in to the
/* Fund 'new' with write credits before dirtying it.  The second
 * argument to lafs_dirty_iblock is the inverse of ib's Dirty bit —
 * presumably so a block being cleaned (Realloc) sends its split
 * sibling to the cleaning cluster too; confirm against the helper.
 */
1815 set_bit(B_Credit, &new->b.flags);
1816 if (uit.credits > 0) {
1817 set_bit(B_ICredit, &new->b.flags);
1821 lafs_dirty_iblock(new, !test_bit(B_Dirty, &ib->b.flags));
1822 lafs_iounlock_block(&new->b);
1823 temp_credits = uit.credits;
1824 putiref(new, MKREF(inc));
1826 case 3: /* Need to grow */
1827 /* new needs a B_Credit and a B_ICredit.
1831 set_bit(B_Credit, &new->b.flags);
1832 if (uit.credits > 0){
1833 set_bit(B_ICredit, &new->b.flags);
1837 lafs_dirty_iblock(new, !test_bit(B_Dirty, &ib->b.flags));
1838 dprintk("Just Grew %s\n", strblk(&new->b));
1839 dprintk(" from %s\n", strblk(&ib->b));
1840 lafs_iounlock_block(&new->b);
1841 temp_credits = uit.credits;
1842 putiref(new, MKREF(inc));
/* Park left-over credits on ib, one per still-clear credit flag
 * (the matching credit decrements are on lines missing from this
 * extract), then return whatever remains to the global pool.
 */
1848 if (uit.credits > 0 && !test_and_set_bit(B_UnincCredit, &ib->b.flags))
1850 if (uit.credits > 0 && !test_and_set_bit(B_ICredit, &ib->b.flags))
1852 if (uit.credits > 0 && !test_and_set_bit(B_NICredit, &ib->b.flags))
1854 if (uit.credits < 0) {
1855 printk("Credits = %d, rv=%d\n", uit.credits, rv);
1856 printk("ib = %s\n", strblk(&ib->b));
1859 lafs_space_return(fs, uit.credits);
1861 /* If this index block is now empty and has no
1862 * children which are not EmptyIndex, we flag it
1863 * as EmptyIndex so index lookups know to avoid it.
1864 * Dirty children also delay empty-handling. This
1865 * is important at the InoIdx level as we cannot drop
1866 * level to 1 while there are still children that
1867 * might want to be incorporated.
1869 if (ib->depth > 1 && ! lafs_index_empty(ib))
1871 if (ib->depth > 1) {
/* Scan children under private_lock: any non-EmptyIndex or dirty
 * child blocks the EmptyIndex transition.
 */
1874 spin_lock(&ib->b.inode->i_data.private_lock);
1875 list_for_each_entry(b, &ib->children, siblings)
1876 if (!test_bit(B_EmptyIndex, &b->flags) ||
1877 test_bit(B_Dirty, &b->flags) ||
1878 test_bit(B_Realloc, &b->flags)) {
1882 spin_unlock(&ib->b.inode->i_data.private_lock);
1886 if (test_bit(B_InoIdx, &ib->b.flags)) {
1887 /* Empty InoIdx blocks are allowed. However depth must
1888 * be 1. This is where we suddenly collapse a now-empty
1891 if (ib->depth > 1) {
1893 LAFSI(ib->b.inode)->depth = 1;
1894 lafs_clear_index(ib);
1896 /* Don't need to dirty the inode - the fact that we just
1897 * did incorporation should ensure it is already dirty
/* A depth-1 block with a live leaf entry or remaining children is
 * not empty — skip the EmptyIndex marking below.
 */
1901 if (ib->depth == 1 &&
1902 (lafs_leaf_next(ib, 0) != 0xFFFFFFFF ||
1903 !list_empty(&ib->children)))
1906 /* Block is really really empty */
1907 set_bit(B_EmptyIndex, &ib->b.flags);
1912 /***************************************************************
1913 * Space pre-allocation
1914 * We need to make sure that the block and all parents
1915 * have a full complement of space credits.
1916 * If this cannot be achieved, we can fail with either
1917 * -ENOSPC if we were allocating more space, or -EAGAIN.
1920 int __must_check lafs_prealloc(struct block *blk, int why)
1922 struct fs *fs = fs_from_inode(blk->inode);
1927 LAFS_BUG(why != CleanSpace && why != AccountSpace
1928 && !test_phase_locked(fs)
1929 && !fs->checkpointing, blk);
1936 if (!test_bit(B_Dirty, &b->flags)) {
1938 need += !test_bit(B_Credit, &b->flags);
1939 else if (!test_and_set_bit(B_Credit, &b->flags))
1942 if (!test_bit(B_UnincCredit, &b->flags)) {
1944 need += !test_bit(B_ICredit, &b->flags);
1945 else if (!test_and_set_bit(B_ICredit, &b->flags))
1948 /* We need N*Credits if this block might need to be
1949 * phase-flipped and remain pinned in the next
1950 * phase. i.e. index blocks and accounting blocks.
1952 if (test_bit(B_Index, &b->flags) ||
1953 LAFSI(b->inode)->type == TypeQuota ||
1954 LAFSI(b->inode)->type == TypeSegmentMap) {
1956 need += !test_bit(B_NCredit, &b->flags);
1957 else if (!test_and_set_bit(B_NCredit, &b->flags))
1960 need += !test_bit(B_NICredit, &b->flags);
1961 else if (!test_and_set_bit(B_NICredit, &b->flags))
1964 if (test_bit(B_InoIdx, &b->flags))
1965 b = &LAFSI(b->inode)->dblock->b;
1969 dprintk("need=%d credits=%d for %s\n", need, credits, strblk(blk));
1971 /* all needed credits allocated */
1972 lafs_space_return(fs, credits);
1976 if (lafs_space_alloc(fs, need, why)) {
1977 dprintk("Got them for %s\n", strblk(blk));
1978 /* We got the credits we asked for */
1982 lafs_space_return(fs, credits);