1 #include "lafs.h"
2
3 /*
4  * fs/lafs/modify.c
5  * Copyright (C) 2005-2009
6  * Neil Brown <neilb@suse.de>
7  * Released under the GPL, version 2
8  *
9  * Index incorporation.
10  *  We have an index block (indirect, extent, or index), which may contain
11  *  some addresses already.
12  *  It has an 'uninc_table' which is a short list of addresses to
13  *  be incorporated.
14  *
15  * Best option is that all the new addresses fit and we don't need to
16  * split or even change between indirect and extent.
17  * But sometimes a format change or a split is needed.
18  *
19  * To handle a format change we generally create a new table and
20  * copy all the data into it.  We maintain some fast paths for the
21  * really simple updates: an indirect block where all new addresses fall in
22  * the existing range, or an index block where all new entries fit at the end.
23  *
24  * In general, the first result of incorporation is a new block
25  * with new addresses, a possibly empty old block, and a possibly empty
26  * list of addresses that didn't fit into the new block.
27  *
28  * If the old block and the list of addresses are empty, we simply
29  * swizzle the block pointers and are done.  If not, we have to split the
30  * index block and we need to inform the parent.  If there is no parent
31  * to inform (we are an inode) then we have to grow the depth of the
32  * index tree.
33  *
34  * Awkward details:
35  * - When an index block is split, we need to scan all children and
36  *   reparent those that now belong to the new block.
37  * - If an index block becomes empty, we can mark it a Hole.
38  * - If the inode's index block has only one entry, maybe we should
39  *   shrink the tree.
40  *
41  * Memory allocation issues.
42  *  We are potentially on the writeout path that is freeing memory when
43  *  performing incorporation, so we have to be careful about memory
44  *  allocations.
45  *  Sometimes we need to allocate a new page for a page split.  It seems
46  *  likely that the old page can be written out and freed soon, but there
47  *  is no guarantee, as new pages might be dirtied at any time and so keep
48  *  the index page pinned in memory.
49  *  The key to avoiding memory exhaustion is to block the dirtying of
50  *  new pages early enough... We will have to worry about that later.
51  */
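/*
 * Layout summary (an editorial sketch inferred from the encode/decode
 * calls below; the authoritative definitions live in lafs.h):
 *
 * Every leaf or index block starts with a 2-byte type field
 * (IBLK_INDIRECT, IBLK_EXTENT or IBLK_INDEX, stored with cpu_to_le16)
 * followed by fixed-size entries:
 *
 *   IBLK_INDIRECT:  6 bytes/entry: 48-bit physaddr; entry i gives the
 *                   device address of file address 'block fileaddr + i',
 *                   and 0 appears to mark a hole.
 *   IBLK_EXTENT:   12 bytes/entry: 48-bit physaddr, 16-bit count,
 *                   32-bit fileaddr.
 *   IBLK_INDEX:    10 bytes/entry: 48-bit physaddr, 32-bit fileaddr.
 *
 * An all-zero entry terminates the used entries of the extent and index
 * layouts.  The encode/decode helpers advance their pointer argument,
 * so packing one extent entry looks roughly like this (illustrative
 * only):
 *
 *   char *p = buf + 2 + i*12;
 *   encode48(p, physaddr);
 *   encode16(p, count);
 *   encode32(p, fileaddr);
 */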
52
53 static void sort_blocks(struct block **blkp)
54 {
55         /* sort blocks list in *blkp by ->fileaddr
56          * use merge sort
57          */
58         int cnt = 0;
59
60         struct block *bl[2];
61         bl[0] = *blkp;
62         bl[1] = NULL;
63
64         do {
65                 struct block **blp[2], *b[2];
66                 int curr = 0;
67                 long prev = 0;
68                 int next = 0;
69                 cnt++;
70                 blp[0] = &bl[0];
71                 blp[1] = &bl[1];
72                 b[0] = bl[0];
73                 b[1] = bl[1];
74
75                 /* take the lesser of b[0] and b[1];
76                  * if it is not smaller than prev, add it to
77                  * blp[curr], else swap curr and then add it
78                  */
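                /* The xor test below picks whichever head comes first in
                 * the "circular" order that starts at prev (values >= prev
                 * sort before values < prev, with ordinary ordering inside
                 * each group).  That keeps extending the current ascending
                 * run for as long as possible; when the chosen value is
                 * less than prev a run has ended and curr flips, so
                 * successive runs are dealt alternately to bl[0] and bl[1]
                 * ready for the next merge pass.
                 */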
79                 while (b[0] || b[1]) {
80                         if (b[next] == NULL ||
81                             (b[1-next] != NULL &&
82                              !((prev <= b[1-next]->fileaddr)
83                                ^(prev <= b[next]->fileaddr)
84                                ^(b[next]->fileaddr <= b[1-next]->fileaddr)))
85                                 )
86                                 next = 1 - next;
87
88                         if (b[next]->fileaddr < prev)
89                                 curr = 1 - curr;
90                         prev = b[next]->fileaddr;
91                         *blp[curr] = b[next];
92                         blp[curr] = &b[next]->chain;
93                         b[next] = b[next]->chain;
94                 }
95                 *blp[0] = NULL;
96                 *blp[1] = NULL;
97         } while (bl[0] && bl[1]);
98         if (bl[0])
99                 *blkp = bl[0];
100         else
101                 *blkp = bl[1];
102 }
103
104 static int incorp_index(struct block *b)
105 {
106         /* This is an index block which we have just incorporated.
107          * Update flags accordingly.
108          */
109         int cr = 0;
110         clear_bit(B_Uninc, &b->flags);
111         if (test_and_clear_bit(B_UnincCredit, &b->flags)) {
112                 temp_credits++;
113                 cr++;
114         }
115         if (test_and_clear_bit(B_PrimaryRef, &b->flags)) {
116                 struct block *pb = list_entry(b->siblings.prev,
117                                               struct block, siblings);
118                 LAFS_BUG(b->siblings.prev == &b->parent->children, b);
119                 putref(pb, MKREF(primary));
120                 if (!test_bit(B_EmptyIndex, &b->flags))
121                         lafs_hash_iblock(iblk(b));
122         }
123         putref(b, MKREF(uninc));
124         return cr;
125 }
126
127 static int incorporate_indirect(struct uninc *ui, char *buf, u32 addr, int len)
128 {
129         /* See if this uninc info can be incorporated directly into
130          * this indirect block, and if it can: do it.
131          */
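        /* Editorial note: this indirect block maps file addresses
         * [addr, addr + len/6), one 6-byte slot each.  The fast path
         * only checks that the final pending range ends inside that
         * window (presumably the pending ranges are kept in ascending
         * order, so the last one reaches furthest); if it does not, we
         * return 0 and leave the caller to handle it some other way.
         */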
132         int i;
133         int credits = 0;
134
135         if ( ui->pending_addr[ui->pending_cnt-1].fileaddr
136             +ui->pending_addr[ui->pending_cnt-1].cnt
137              > addr + (len/6))
138                 return 0;
139
140         for (i = 0; i < ui->pending_cnt; i++) {
141                 u32 ad = ui->pending_addr[i].fileaddr - addr;
142                 u32 cnt = ui->pending_addr[i].cnt;
143                 u64 phys = ui->pending_addr[i].physaddr;
144                 credits += cnt;
145                 while (cnt--) {
146                         char *p = buf + 6*ad;
147                         encode48(p, phys);
148                         ad++;
149                         phys++;
150                 }
151         }
152         return credits;
153 }
154
155 static int incorporate_extent(struct uninc *ui, char *buf, int size)
156 {
157         /* See if the common case of new data being appended to the file
158          * applies, i.e. whether there is enough space in buf to add the
159          * extents listed in ui.  If so, add them. */
160         /* Find the last valid extent. */
161         int e = (size)/12;
162         int i;
163         u32 last = 0;
164         int credits = 0;
165
166         while (e > 0) {
167                 char *b;
168                 int esize;
169                 b = buf + (e-1)*12 + 6;
170                 esize = decode16(b);
171                 if (esize != 0) {
172                         last = decode32(b)+esize;
173                         break;
174                 }
175                 e--;
176         }
177         /* there are 'e' valid extents; */
178         if (size/12 - e < ui->pending_cnt
179             || ui->pending_addr[0].fileaddr < last)
180                 return 0;
181
182         /* OK, they fit. */
183         buf += e*12;
184         for (i = 0; i < ui->pending_cnt; i++)
185                 if (ui->pending_addr[i].physaddr) {
186                         dprintk("AddedX %llu %u %lu\n",
187                                 (unsigned long long)ui->pending_addr[i].physaddr,
188                                 (unsigned) ui->pending_addr[i].cnt,
189                                 (unsigned long)ui->pending_addr[i].fileaddr);
190                         credits += ui->pending_addr[i].cnt;
191                         encode48(buf, ui->pending_addr[i].physaddr);
192                         encode16(buf, ui->pending_addr[i].cnt);
193                         encode32(buf, ui->pending_addr[i].fileaddr);
194                 }
195         return credits;
196 }
197
198 static int incorporate_index(struct block *uninc, char *buf, int size)
199 {
200         /* see if we can merge the addresses of uninc blocks
201          * with those in buf, by counting the total number,
202          * and if they fit, merge backwards.
203          * Return -1 on failure, or number of credits collected.
204          */
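        /* Outline of the merge below (editorial summary):
         *  1. Walk both lists once, counting how many entries the merged
         *     result needs (ncnt), how many existing entries are replaced
         *     (repcnt), and how many incoming entries are deletions,
         *     i.e. have physaddr == 0 (delcnt).
         *  2. If the result will not fit, give up (-1).
         *  3. If every existing entry is being replaced, simply rewrite
         *     the table from the uninc list.
         *  4. If any deletions remain, give up (-1); otherwise reverse the
         *     uninc list and merge from the highest slot downwards, so no
         *     entry that is still needed gets overwritten before it has
         *     been copied.
         */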
205         int ucnt = 0;
206         int icnt = size/10;
207         int ncnt;
208         int delcnt = 0;
209         int repcnt = 0;
210         int i;
211         char *b;
212         u32  uaddr, iaddr;
213         struct block *blk;
214         int credits;
215
216         while (icnt > 0) {
217                 u64 phys;
218                 b = buf + (icnt-1)*10;
219                 phys = decode48(b);
220                 if (phys != 0)
221                         break;
222                 icnt--;
223         }
224         for (blk = uninc ; blk ; blk = blk->chain)
225                 ucnt++;
226         i = 0;
227         ncnt = 0;
228         LAFS_BUG(ucnt == 0, uninc);
229         uaddr = uninc->fileaddr;
230         blk = uninc;
231
232         while (blk || i < icnt) {
233                 if (blk)
234                         uaddr = blk->fileaddr;
235                 else
236                         uaddr = 0xffffffff;
237                 if (i < icnt) {
238                         b = buf + i*10 + 6;
239                         iaddr = decode32(b);
240                 } else
241                         iaddr = 0xffffffff;
242
243                 if (uaddr == iaddr) {
244                         /* replacement */
245                         if (blk->physaddr)
246                                 ncnt++;
247                         else
248                                 delcnt++;
249                         repcnt++;
250                         i++;
251                         blk = blk->chain;
252                 } else if (uaddr < iaddr) {
253                         /* New entry, unlikely to have a phys of 0... */
254                         if (blk->physaddr)
255                                 ncnt++;
256                         else
257                                 delcnt++; /* not strictly a delete, but too
258                                            * confusing to handle.
259                                            */
260                         blk = blk->chain;
261                 } else {/* Old unchanged entry */
262                         ncnt++;
263                         i++;
264                 }
265         }
266         if (ncnt > (size/10))
267                 return -1;
268         if (repcnt == icnt) {
269                 /* all currently incorporated addresses are
270                  * replaced or removed, so just install the
271                  * new addresses
272                  */
273                 credits = 0;
274                 b = buf;
275                 i = 0;
276                 temp_credits = 0;
277                 while (uninc) {
278                         blk = uninc;
279                         uninc = uninc->chain;
280                         if (blk->physaddr) {
281                                 encode48(b, blk->physaddr);
282                                 encode32(b, blk->fileaddr);
283                                 i++;
284                         }
285                         credits += incorp_index(blk);
286                 }
287                 while (i < icnt) {
288                         encode48(b, 0ULL);
289                         encode32(b, 0);
290                         i++;
291                 }
292                 temp_credits = 0;
293                 return credits;
294         }
295         if (delcnt) {
296                 /* some addresses have been deleted.  Too hard to do
297                  * quickly
298                  */
299                 return -1;
300         }
301         /* OK, we have ncnt addresses, and we can write them from last
302          * to first without overwriting anything, so let's do it.
303          */
304         blk = NULL;
305         while (uninc) {
306                 struct block *t = blk;
307                 blk = uninc;
308                 uninc = uninc->chain;
309                 blk->chain = t;
310         }
311         uninc = blk;
312         credits = 0;
313         temp_credits = 0;
314         while (i || uninc) {
315                 int drop_one = 0;
316                 if (uninc)
317                         uaddr = uninc->fileaddr;
318                 else
319                         uaddr = 0;
320                 if (i) {
321                         b = buf + (i-1)*10 + 6;
322                         iaddr = decode32(b);
323                 } else
324                         iaddr = 0;
325
326                 if (uaddr == iaddr && uaddr == 0) {
327                         /* '0' is both least value and end marker, so
328                          * be careful.  We know there is no deletion, so
329                          * ->physaddr cannot be zero.
330                          */
331                         ncnt--;
332                         if (uninc) {
333                                 BUG_ON(uninc->physaddr == 0);
334                                 b = buf + ncnt*10;
335                                 encode48(b, uninc->physaddr);
336                                 encode32(b, uaddr);
337                                 drop_one = 1;
338                         }
339                         if (i)
340                                 i--;
341                 } else if (uaddr == iaddr) {
342                         BUG_ON(uninc->physaddr == 0);
343                         ncnt--;
344                         b = buf + ncnt*10;
345                         encode48(b, uninc->physaddr);
346                         encode32(b, uaddr);
347                         i--;
348                         drop_one = 1;
349                 } else if (uaddr > iaddr) {
350                         if (uninc->physaddr) {
351                                 ncnt--;
352                                 b = buf + ncnt*10;
353                                 encode48(b, uninc->physaddr);
354                                 encode32(b, uaddr);
355                         }
356                         drop_one = 1;
357                 } else {
358                         u64 p;
359                         u32 a;
360                         i--; ncnt--;
361                         b = buf + i*10;
362                         p = decode48(b);
363                         a = decode32(b);
364                         b = buf + ncnt*10;
365                         encode48(b, p);
366                         encode32(b, a);
367                 }
368                 if (drop_one) {
369                         blk = uninc;
370                         uninc = uninc->chain;
371                         credits += incorp_index(blk);
372                 }
373         }
374         temp_credits = 0;
375         BUG_ON(ncnt);
376         return credits;
377 }
378
379 void lafs_clear_index(struct indexblock *ib)
380 {
381         int len = 1 << ib->b.inode->i_blkbits;
382         int offset = 0;
383         char *buf;
384         if (test_bit(B_InoIdx, &ib->b.flags))
385                 offset = LAFSI(ib->b.inode)->metadata_size;
386         buf = map_iblock(ib);
387         memset(buf+offset, 0, len - offset);
388
389         if (ib->depth == 1)
390                 *(u16 *)(buf+offset) = cpu_to_le16(IBLK_INDIRECT);
391         else if (ib->depth > 1)
392                 *(u16 *)(buf+offset) = cpu_to_le16(IBLK_INDEX);
393         else
394                 LAFS_BUG(1, &ib->b);
395        
396         if (offset) {
397                 /* just to be on the safe side ... */
398                 ((struct la_inode*)(buf))->depth = ib->depth;
399         }
400         unmap_iblock(ib, buf);
401         set_bit(B_Valid, &ib->b.flags);
402 }
403
404 static void grow_index_tree(struct indexblock *ib, struct indexblock *new)
405 {
406         /* do_incorporate_leaf or do_incorporate_internal was working on an
407          * inode and didn't find enough space.
408          * So demote 'ib' to a regular Index block and make 'new' a new
409          * InoIdx block (inode->iblock).
410          * We have IOLock on ib to ensure exclusivity.
411          */
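        /* Rough picture of the transformation (editorial sketch):
         *
         *   before:  inode --iblock--> ib        (InoIdx, depth N; index
         *                                         data lives in the inode's
         *                                         metadata block)
         *
         *   after:   inode --iblock--> new       (InoIdx, depth N+1)
         *                               |
         *                               +--> ib  (now an ordinary index
         *                                         block carrying the old
         *                                         entries; as first child
         *                                         it needs no explicit
         *                                         index entry)
         */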
412
413         int blksize = ib->b.inode->i_sb->s_blocksize;
414         int offset = LAFSI(ib->b.inode)->metadata_size;
415         char *ibuf;
416
417         ib->data = new->data;  new->data = NULL;
418         clear_bit(B_InoIdx, &ib->b.flags);
419         lafs_hash_iblock(ib);
420         set_bit(B_InoIdx, &new->b.flags);
421         if (test_and_clear_bit(B_Root, &ib->b.flags))
422                 set_bit(B_Root, &new->b.flags);
423
424         new->b.fileaddr = 0;
425         new->b.inode = ib->b.inode;
426         new->b.parent = ib->b.parent;
427         ib->b.parent = new;
428         getiref(new, MKREF(child));
429
430         clear_bit(B_PhysValid, &ib->b.flags);
431         ib->b.physaddr = 0;
432
433         ibuf = map_iblock(new);
434         memcpy(ib->data, ibuf + offset, blksize - offset);
435         memset(ib->data + blksize - offset, 0, offset);
436         LAFS_BUG(ib->depth == 0, &ib->b);
437         new->depth = ib->depth + 1;
438         LAFSI(new->b.inode)->depth++;
439         ((struct la_inode*)(ibuf))->depth = LAFSI(new->b.inode)->depth;
440
441         /* There is no need to set PrimaryRef as the first child in an
442          * index block is implicitly incorporated - at least enough to be
443          * found.  So the ->parent link holds our place in the tree
444          */
445         unmap_iblock(new, ibuf);
446
447         LAFS_BUG(LAFSI(new->b.inode)->iblock != ib, &ib->b);
448         LAFSI(new->b.inode)->iblock = new;
449         list_add(&ib->b.siblings, &new->children);
450         INIT_LIST_HEAD(&new->b.siblings);
451
452         LAFS_BUG(new->depth != LAFSI(new->b.inode)->depth, &ib->b);
453         lafs_clear_index(new);
454
455         /* Make sure pin count is correct */
456         LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
457         set_bit(B_Pinned, &new->b.flags);
458         set_phase(&new->b, test_bit(B_Phase1, &ib->b.flags));
459         atomic_set(&new->pincnt[!!test_bit(B_Phase1, &ib->b.flags)], 1);
460 }
461
462 static void print_index(char *buf, u32 start, int len)
463 {
464         int type = le16_to_cpu(*(u16 *)(buf));
465         char *p = buf+2;
466         u32 addr;
467         int size;
468         u64 phys;
469         len -= 2;
470
471         switch (type) {
472         case IBLK_INDIRECT:
473                 printk(" Indirect:\n");
474                 while (len >= 6) {
475                         phys = decode48(p);
476                         if (phys)
477                                 printk(" %u:%lu", start, (unsigned long)phys);
478                         start++;
479                         len -= 6;
480                 }
481                 printk("\n");
482                 break;
483         case IBLK_EXTENT:
484                 printk(" Extent:\n");
485                 while (len >= 12) {
486                         phys = decode48(p);
487                         size = decode16(p);
488                         addr = decode32(p);
489                         if (size)
490                                 printk(" %u-%u: %llu\n", addr, addr+size-1,
491                                        (unsigned long long) phys);
492                         len -= 12;
493                 }
494                 break;
495         case IBLK_INDEX:
496                 printk(" Index:\n");
497                 while (len >= 10) {
498                         phys = decode48(p);
499                         addr = decode32(p);
500                         if (phys)
501                                 printk(" %u:%lu  ", addr, (unsigned long)phys);
502                         len -= 10;
503                 }
504                 printk("\n");
505                 break;
506         }
507 }
508
509 void lafs_print_uninc(struct uninc *ui)
510 {
511         int i;
512         printk(" pending=%d\n", ui->pending_cnt);
513         for (i = 0; i < ui->pending_cnt; i++)
514                 printk("  [%d] %u-%u : %lu\n", i,
515                        ui->pending_addr[i].fileaddr,
516                        ui->pending_addr[i].fileaddr
517                        +ui->pending_addr[i].cnt - 1,
518                        (unsigned long)ui->pending_addr[i].physaddr);
519 }
520
521 struct layoutinfo {
522         char *data;
523         int size;
524         u32 nextaddr; /* on failure, set to the first address that doesn't fit */
525
526         u64 lastphys;
527         u32 lastaddr;
528         int esize;
529 };
530
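/*
 * Editorial note on the handler protocol shared by add_index, add_indirect,
 * add_extent and check_leaf (and driven by the walk_* functions below):
 * each handler is first called with phys == 0 to initialise, then once per
 * address or extent, and finally called again with phys == ~0 and/or
 * len == 0 to finalise.  A handler returns how many addresses it consumed;
 * returning less than it was offered tells the walker to stop, and the
 * add_* handlers also record the first address that did not fit in
 * layoutinfo.nextaddr so the caller knows where to split.
 */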
531 static int add_index(void *data, u32 addr, u64 phys)
532 {
533         struct layoutinfo *li = data;
534         char *cp = li->data;
535
536         if (phys == 0) {
537                 /*initialise */
538                 encode16(cp, IBLK_INDEX);
539                 li->size -= 2;
540                 li->data = cp;
541                 return 0;
542         }
543         if (phys == ~0LL)
544                 return 0;
545
546         if (li->size < 10) {
547                 li->nextaddr = addr;
548                 return 0;
549         }
550
551         cp = li->data;
552         /* Each entry is 10 bytes: 6-byte dev address, 4-byte file address */
553         encode48(cp, phys);
554         encode32(cp, addr);
555
556         li->data = cp;
557         li->size -= 10;
558         return 1;
559 }
560
561 static int add_indirect(void *data, u32 addr, u64 phys, int len)
562 {
563         struct layoutinfo *li = data;
564         u32 lastaddr;
565         char *p;
566         int i;
567
568         if (phys == 0) {
569                 /* initialise */
570                 li->lastaddr = addr;
571                 p = li->data;
572                 encode16(p, IBLK_INDIRECT);
573                 li->size -= 2;
574                 li->data = p;
575                 return 0;
576         }
577         if (len == 0)
578                 return 0; /* finalise */
579
580         p = li->data + (addr - li->lastaddr) * 6;
581         lastaddr = li->lastaddr + (li->size/6);
582         for (i = 0; i < len && addr < lastaddr ; i++) {
583                 encode48(p, phys);
584                 addr++;
585                 phys++;
586         }
587         if (i < len)
588                 li->nextaddr = addr;
589         return i;
590 }
591
592 static int add_extent(void *data, u32 addr, u64 phys, int len)
593 {
594         struct layoutinfo *li = data;
595         char *p;
596
597         if (phys == 0) {
598                 /* initialise */
599                 p = li->data;
600                 encode16(p, IBLK_EXTENT);
601                 li->size -= 2;
602                 li->data = p;
603                 li->esize = 0;
604                 return 0;
605         }
606         if (len == 0) {
607                 /* finalise */
608                 /* close off an extent if there is one */
609                 dprintk("Finalise: %d\n", (int)li->esize);
610                 if (li->esize) {
611                         p = li->data - 6;
612                         encode16(p, li->esize);
613                         li->esize = 0;
614                 }
615                 return 0;
616         }
617
618         if (li->esize && li->lastaddr == addr &&
619             li->lastphys == phys) {
620                 /* just extend the extent */
621                 dprintk("Extending %llu %u %lu\n",
622                         (unsigned long long)phys,
623                         (unsigned) len,
624                         (unsigned long)addr);
625                 li->esize += len;
626                 li->lastaddr += len;
627                 li->lastphys += len;
628                 return len;
629         }
630         if (li->esize) {
631                 /* need to close old extent */
632                 p = li->data - 6;
633                 encode16(p, li->esize);
634                 li->esize = 0;
635         }
636         if (li->size < 12) {
637                 /* No room */
638                 dprintk("Extent says 'no room'\n");
639                 li->nextaddr = li->lastaddr;
640                 return 0;
641         }
642         p = li->data;
643         encode48(p, phys);
644         encode16(p, len);
645         encode32(p, addr);
646         dprintk("Added %llu %u %lu\n",
647                 (unsigned long long)phys,
648                 (unsigned) len,
649                 (unsigned long)addr);
650         li->esize = len;
651         li->data = p;
652         li->size -= 12;
653         li->lastaddr = addr + len;
654         li->lastphys = phys + len;
655         return len;
656 }
657
658 struct leafinfo {
659         u32     firstaddr;
660         u32     nextaddr;
661         u64     nextphys;
662         int     extents;
663         int     esize;
664         int     choice;
665         int     nofit; /* bit mask of which layouts don't fit */
666         int     blksize;
667         int     cnt;
668 };
669
670 static int check_leaf(void *data, u32 addr, u64 phys, int len)
671 {
672         struct leafinfo *li = data;
673         int origlen = len;
674
675         /* We are gathering data to see whether an indirect block or
676          * an extent block is best - depending on which fills the block
677          * first (that one is not a good choice).
678          * We also see if either will fit at all.
679          * For indirect, we give up as soon as an address will not fit in
680          *  the block.
681          * For extent, we track how many extents will be needed, and give up
682          *  when that exceeds the size of the block.
683          */
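        /* Worked example (editorial; assuming a 1024-byte block and no
         * inode metadata, so blksize is 1022 after the type field): an
         * indirect layout holds 1022/6 = 170 six-byte slots, i.e. a range
         * of 170 file addresses, while an extent layout holds 1022/12 = 85
         * extents.  Scattered blocks in a narrow range favour indirect;
         * long contiguous runs favour extents.
         */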
684
685         if (phys == 0) {
686                 /* We are initialising the structure */
687                 memset(li, 0, sizeof(*li));
688                 li->firstaddr = addr;
689                 li->blksize = len - 2; /* exclude type field */
690                 return 0;
691         }
692         if (len == 0)
693                 return 0; /* nothing to finalise */
694
695         li->cnt++;
696         /* Check for indirect layout */
697         if (((addr + len) - li->firstaddr) * 6 > li->blksize) {
698                 /* Indirect no longer fits */
699                 if (!li->choice)
700                         li->choice = IBLK_EXTENT;
701                 li->nofit |= (1 << IBLK_INDIRECT);
702         }
703         /* Check for extent layout */
704         if (addr == li->nextaddr && phys == li->nextphys) {
705                 /* This fits in the current extent, at least partly */
706                 int size = len;
707                 if (li->esize + size >= 0x10000) {
708                         /* extent would overflow the 16-bit count: cap it */
709                         size = 0x10000 - li->esize - 1;
710                 }
711                 li->esize += size;
712                 li->nextaddr += size;
713                 li->nextphys += size;
714                 len -= size;
715                 addr += size;
716                 phys += size;
717         }
718         if (len) {
719                 /* need a new extent */
720                 li->extents++;
721                 if (li->extents * 12 > li->blksize) {
722                         /* But it won't fit */
723                         if (!li->choice)
724                                 li->choice = IBLK_INDIRECT;
725                         li->nofit |= (1 << IBLK_EXTENT);
726                 }
727                 li->nextaddr = addr+1;
728                 li->nextphys = phys+1;
729                 li->esize = len;
730         }
731         if (li->choice &&
732             (li->nofit & (1 << IBLK_INDIRECT)) &&
733             (li->nofit & (1 << IBLK_EXTENT)))
734                 /* stop looking */
735                 return 0;
736         return origlen;
737 }
738
739 static u32 walk_index(u32 addr, char **bufp, int len, struct block *uninc,
740                       int (*handle)(void*, u32, u64),
741                       void *data)
742 {
743         /* Pass each entry in buf or uninc to "handle" (which is
744          * always add_index).
745          * Must pass addresses in order.
746          * 'handle' returns 1 if the address was added.
747          * We return 1 if all addresses were handled, else 0.
748          *
749          * Entries are 10 bytes: 6 byte dev address, 4 byte file address.
750          */
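        /* Editorial note: entries from 'uninc' take precedence over an
         * entry in 'buf' with the same file address, and an uninc block
         * whose physaddr is 0 is never passed to 'handle'; a matching
         * zero-physaddr block therefore deletes the old entry.
         */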
751         char *buf = *bufp;
752         int found_end = 0;
753
754         handle(data, addr, 0); /* initialise */
755
756         while ((len >= 10 && !found_end) || uninc != NULL) {
757                 unsigned long addr = 0;
758                 u64 phys = 0;
759
760                 if (len >= 10) {
761                         phys = decode48(buf);
762                         addr = decode32(buf);
763                         len -= 10;
764                         if (phys == 0)
765                                 found_end = 1;
766                 }
767
768                 while (uninc &&
769                        (phys == 0 || uninc->fileaddr <= addr)) {
770                         /* New block at or before current address */
771                         if (uninc->physaddr &&
772                             !handle(data, uninc->fileaddr, uninc->physaddr)) {
773                                 if (addr > uninc->fileaddr)
774                                         buf -= 10;
775                                 *bufp = buf;
776                                 return 0;
777                         }
778                         if (uninc->fileaddr == addr)
779                                 /* Don't record the old value from the index */
780                                 phys = 0;
781                         uninc = uninc->chain;
782                 }
783
784                 if (phys && !handle(data, addr, phys)) {
785                         *bufp = buf - 10;
786                         return 0;
787                 }
788         }
789         handle(data, 0, ~(u64)0); /* finalise */
790         return 1;
791 }
792
793 static u32 walk_indirect(u32 addr, char **bufp, int len, struct uninc *ui,
794                          int (*handle)(void*, u32, u64, int),
795                          void *data)
796 {
797         /* Pass each entry in buf or ui to "handle".
798          * Must pass addresses in order.
799          * 'handle' returns number of addresses handled.
800          * If that is fewer than we passed, we stop and return 0;
801          * otherwise we return 1 once everything is handled.
802          */
803         int uinum = 0;
804         int uioffset = 0;
805         char *buf = *bufp;
806
807         handle(data, addr, 0, len); /* initialise */
808
809         while (len >= 6 || uinum < ui->pending_cnt) {
810                 u64 phys = 0;
811
812                 if (len >= 6) {
813                         phys = decode48(buf);
814                         len -= 6;
815                 } else if (uinum < ui->pending_cnt &&
816                            addr < ui->pending_addr[uinum].fileaddr + uioffset)
817                         /* Skip ahead quickly. */
818                         addr = ui->pending_addr[uinum].fileaddr + uioffset;
819
820                 if (uinum < ui->pending_cnt &&
821                     ui->pending_addr[uinum].fileaddr + uioffset == addr) {
822                         phys = ui->pending_addr[uinum].physaddr + uioffset;
823                         uioffset++;
824                         if (uioffset >= ui->pending_addr[uinum].cnt) {
825                                 uinum++;
826                                 uioffset = 0;
827                         }
828                         /* FIXME what if pending address ranges overlap !! */
829                         /* FIXME should a pending_addr range of holes be allowed? */
830                         /* FIXME should we check that addr is never > fileaddr+offset ? */
831                 }
832                 if (phys) {
833                         if (!handle(data, addr, phys, 1)) {
834                                 *bufp = buf - 6;
835                                 return 0;
836                         }
837                 }
838                 addr++;
839         }
840
841         handle(data, 0, ~(u64)0, 0); /* finalise */
842         return 1;
843 }
844
845 static u32 walk_extent(u32 addr, char **bufp, int len, struct uninc *ui,
846                        int (*handle)(void*, u32, u64, int),
847                        void *data)
848 {
849         /* pass each extent in buf or ui to handle.  Extents
850          * can overlap, so care is needed
851          */
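        /* Example of the overlap handling (editorial; written as
         * fileaddr+count -> physaddr): if the block holds 100+20 -> P and
         * ui holds the pending range 110+5 -> Q, the walk emits
         * 100+10 -> P, then 110+5 -> Q, then 115+5 -> P+15: pending
         * ranges replace whatever part of an on-disk extent they overlap.
         */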
852         int uinum = 0;
853         int uioffset = 0;
854         u64 ephys = 0;
855         u32 eaddr = 0;
856         int elen = 0;
857         int found_end = 0;
858         int handled;
859         char *buf = *bufp;
860
861         handle(data, addr, 0, len); /* initialise */
862
863         while (uinum < ui->pending_cnt || ! found_end) {
864                 if (elen == 0 && !found_end) {
865                         if (len >= 12) {
866                                 ephys = decode48(buf);
867                                 elen = decode16(buf);
868                                 eaddr = decode32(buf);
869                                 len -= 12;
870                                 BUG_ON(ephys == 0 && elen != 0); // FIXME fail gracefully
871                                 dprintk("Found %llu %u %lu at %d\n",
872                                         (unsigned long long) ephys,
873                                         (unsigned) elen,
874                                         (unsigned long)eaddr, len);
875                                 if (ephys == 0) {
876                                         eaddr = 0xFFFFFFFFUL;
877                                         found_end = 1;
878                                 }
879                         } else {
880                                 eaddr = 0xFFFFFFFFUL;
881                                 elen = 0;
882                                 found_end = 1;
883                         }
884                 }
885
886                 /* Handle all pending extents that start before (or at) here */
887                 while (uinum < ui->pending_cnt &&
888                        ui->pending_addr[uinum].fileaddr + uioffset <= eaddr &&
889                        (elen > 0 || eaddr == 0xFFFFFFFFUL)
890                         ) {
891                         /* just submit next extent from ui */
892                         s32 overlap;
893                         int hlen = ui->pending_addr[uinum].cnt - uioffset;
894                         /* If there is an overlap with the current extent, only
895                          * add the overlapped portion.
896                          */
897                         dprintk("(%llu %lu %u) (%llu %lu %u) + %d = %d (%d)\n",
898                                 ephys, (unsigned long)eaddr, elen,
899                                 ui->pending_addr[uinum].physaddr,
900                                 (unsigned long)ui->pending_addr[uinum].fileaddr,
901                                 ui->pending_addr[uinum].cnt,
902                                 uioffset, hlen, uinum);
903                         if (elen &&
904                             ui->pending_addr[uinum].fileaddr + uioffset + hlen
905                             > eaddr + elen)
906                                 hlen = (eaddr + elen) -
907                                         (ui->pending_addr[uinum].fileaddr + uioffset);
908                         if (ui->pending_addr[uinum].physaddr)
909                                 handled = handle(data,
910                                                  ui->pending_addr[uinum].fileaddr + uioffset,
911                                                  ui->pending_addr[uinum].physaddr + uioffset,
912                                                  hlen);
913                         else
914                                 handled = hlen;
915                         /* That might replace some of current extent */
916                         overlap = (ui->pending_addr[uinum].fileaddr + uioffset +
917                                    handled) - eaddr;
918                         if (elen && overlap > 0) {
919                                 if (overlap > elen)
920                                         overlap = elen;
921                                 elen -= overlap;
922                                 eaddr += overlap;
923                                 ephys += overlap;
924                         }
925                         if (handled < hlen) {
926                                 if (elen)
927                                         *bufp = buf - 12;
928                                 else
929                                         *bufp = buf;
930                                 return 0;
931                         }
932                         uioffset += hlen;
933                         if (uioffset >= ui->pending_addr[uinum].cnt) {
934                                 uinum++;
935                                 uioffset = 0;
936                         }
937                 }
938                 BUG_ON(elen && uioffset);
939                 /* If uninc extent intersects this extent, just submit
940                  * partial extent and loop
941                  */
942                 if (elen && uinum < ui->pending_cnt &&
943                     ui->pending_addr[uinum].fileaddr < eaddr + elen) {
944                         int elen2 = ui->pending_addr[uinum].fileaddr - eaddr;
945                         handled = handle(data, eaddr, ephys, elen2);
946                         if (handled < elen2) {
947                                 *bufp = buf - 12;
948                                 return 0;
949                         }
950                         eaddr += elen2;
951                         ephys += elen2;
952                         elen -= elen2;
953                 } else if (elen) {
954                         /* Ok, just submit this extent */
955                         handled = handle(data, eaddr, ephys, elen);
956                         if (handled < elen) {
957                                 *bufp = buf - 12;
958                                 return 0;
959                         }
960
961                         elen = 0;
962                 }
963         }
964         *bufp = buf;
965         handle(data, 0, ~(u64)0, 0); /* finalise */
966         return 1;
967 }
968
969 void lafs_walk_leaf_index(struct indexblock *ib,
970                           int (*handle)(void*, u32, u64, int),
971                           void *data)
972 {
973         char *ibuf, *buf;
974         int offset;
975         int len;
976         int current_layout;
977         struct fs *fs = fs_from_inode(ib->b.inode);
978         struct uninc ui;
979         ui.pending_cnt = 0;
980
981         /* Cannot use kmap_atomic as handle might sleep when
982          * looking up segusage blocks
983          */
984         if (test_bit(B_InoIdx, &ib->b.flags)) {
985                 ibuf = kmap(LAFSI(ib->b.inode)->dblock->page);
986                 ibuf += dblock_offset(LAFSI(ib->b.inode)->dblock);
987         } else
988                 ibuf = map_iblock(ib);
989         if (test_bit(B_InoIdx, &ib->b.flags))
990                 offset = LAFSI(ib->b.inode)->metadata_size;
991         else
992                 offset = 0;
993         len = fs->blocksize - offset;
994
995         current_layout = le16_to_cpu(*(u16 *)(ibuf+offset));
996         buf = ibuf + offset + 2;
997         dprintk("CURRENT=%d\n", current_layout);
998         switch (current_layout) {
999         case IBLK_INDIRECT:
1000                 walk_indirect(ib->b.fileaddr,
1001                               &buf, len-2, &ui,
1002                               handle, data);
1003                 break;
1004         case IBLK_EXTENT:
1005                 walk_extent(ib->b.fileaddr,
1006                             &buf, len-2, &ui,
1007                             handle, data);
1008                 break;
1009         default:
1010                 printk("CURRENT=%d\n", current_layout);
1011                 LAFS_BUG(1, &ib->b); // FIXME should be IO error ??
1012         }
1013         if (test_bit(B_InoIdx, &ib->b.flags))
1014                 kunmap(LAFSI(ib->b.inode)->dblock->page);
             else
                     unmap_iblock(ib, ibuf);
1015 }
1016
1017 static void share_list(struct block **ibp, struct block **newp, u32 next)
1018 {
1019         struct block *uin = *ibp;
1020         *ibp = NULL;
1021         while (uin) {
1022                 struct block *b = uin;
1023                 uin = b->chain;
1024
1025                 if (b->fileaddr < next) {
1026                         *ibp = b;
1027                         ibp = &b->chain;
1028                 } else {
1029                         *newp = b;
1030                         newp = &b->chain;
1031                 }
1032         }
1033         *ibp = NULL;
1034         *newp = NULL;
1035 }
1036
1037 static void share_uninc(struct uninc *from, struct uninc *to,
1038                          u32 next)
1039 {
1040         /* Any addresses in 'from' that are at-or-after 'next' go to 'to'.
1041          * Credits need to be shared somehow too...
1042          */
1043         int i, j;
1044         struct addr *ia, *ja;
1045         u32 len;
1046         int moved = 0;
1047
1048         for (i = 0; i < from->pending_cnt; i++) {
1049                 ia = &from->pending_addr[i];
1050                 if (ia->fileaddr +
1051                     ia->cnt <= next)
1052                         /* This one stays put */
1053                         continue;
1054                 if (ia->fileaddr < next) {
1055                         /* This one gets split */
1056                         j = to->pending_cnt;
1057                         ja = &to->pending_addr[j];
1058                         len = next - ia->fileaddr;
1059
1060                         ja->fileaddr = next;
1061                         ja->cnt = ia->cnt - len;
1062                         ja->physaddr = ia->physaddr + len;
1063                         moved += ia->cnt - len;
1064
1065                         ia->cnt = len;
1066                         to->pending_cnt = j+1;
1067                         continue;
1068                 }
1069                 /* This one gets moved across. */
1070                 j = to->pending_cnt;
1071                 ja = &to->pending_addr[j];
1072                 *ja = *ia;
1073                 to->pending_cnt = j+1;
1074                 moved += ia->cnt;
1075
1076                 /* Move the last one into this spot, and try again */
1077                 j = from->pending_cnt-1;
1078                 ja = &from->pending_addr[j];
1079                 *ia = *ja;
1080                 from->pending_cnt = j;
1081                 i--;
1082         }
1083         /* FIXME this doesn't make sense.
1084          * if the test can fail, we must have a better answer
1085          */
1086         if (from->credits >= moved) {
1087                 to->credits += moved;
1088                 from->credits -= moved;
1089         } else {
1090                 int c = from->credits / 2;
1091                 to->credits += c;
1092                 from->credits -= c;
1093                 BUG();
1094         }
1095 }
1096
1097 static void share_children(struct indexblock *ib, struct indexblock *new)
1098 {
1099         /* Some of the children for *ib must now become children
1100          * of *new.  When moving children we must be careful that
1101          * blocks with B_PrimaryRef stay with their primary,
1102          * or drop the flag and the reference
1103          */
1104         struct block *b, *tmp;
1105
1106         list_for_each_entry_safe(b, tmp, &ib->children, siblings) {
1107                 if (b->fileaddr == new->b.fileaddr &&
1108                     test_bit(B_Index, &b->flags) &&
1109                     test_and_clear_bit(B_PrimaryRef, &b->flags)) {
1110                         /* This block had a 'Primary' reference on
1111                          * the previous block.  We must drop that
1112                          * before it can be moved.
1113                          */
1114                         putref(list_entry(b->siblings.prev,
1115                                           struct block, siblings),
1116                                MKREF(primary));
1117                 }
1118                 if (b->fileaddr >= new->b.fileaddr) {
1119                         list_move_tail(&b->siblings, &new->children);
1120                         b->parent = new;
1121                         getiref(new, MKREF(child));
1122                         if (test_bit(B_Pinned, &b->flags)) {
1123                                 int ph = test_bit(B_Phase1, &b->flags);
1124                                 atomic_inc(&new->pincnt[ph]);
1125                                 atomic_dec(&ib->pincnt[ph]);
1126                         }
1127                         putiref(ib, MKREF(child));
1128                 }
1129         }
1130 }
1131
1132 static int do_incorporate_leaf(struct fs *fs, struct indexblock *ib,
1133                                struct uninc *ui,
1134                                struct indexblock *new)
1135 {
1136         /* Incorporate all the changes in ib->uninc_table into ib,
1137          * with any new address appearing in new.
1138          * Return value is one of:
1139          * 0: uninc_table removed everything, block is now empty.
1140          * 1: everything fits in 'ib'.  New can be discarded.  uninc_table
1141          *    is empty.
1142          * 2: the addresses are in ib, new, and possibly new->uninc_table.
1143          *    'new' has ->fileaddr set.
1144          * 3: ib is InoIdx and the addresses didn't all fit, so everything
1145          *    was copied to new and ib is now an IBLK_INDEX block pointing
1146          *    at new.  'new' still needs incorporation.
1147          *
1148          * We first simulate laying out the info in both
1149          * Indirect and Extent layouts and choose the best.
1150          * Then we perform the layout into new. Then we swap data
1151          * between ib and new as appropriate and return the result.
1152          */
1153         char *ibuf, *nbuf;
1154         char *buf, *b2, *sbuf;
1155         int offset;
1156         int len, slen;
1157         struct leafinfo leafinfo;
1158         int current_layout;
1159         int choice;
1160         struct layoutinfo layout;
1161         u32 next;
1162         int uinxt, uinum;
1163         int ok;
1164
1165         u32 eaddr;
1166         u64 ephys;
1167         int elen;
1168
1169         ibuf = map_iblock(ib);
1170         if (test_bit(B_InoIdx, &ib->b.flags))
1171                 offset = LAFSI(ib->b.inode)->metadata_size;
1172         else
1173                 offset = 0;
1174         len = fs->blocksize - offset;
1175
1176         check_leaf(&leafinfo, ib->b.fileaddr, 0, len);
1177
1178         current_layout = le16_to_cpu(*(u16 *)(ibuf+offset));
1179         buf = ibuf + offset + 2;
1180         switch (current_layout) {
1181         case IBLK_INDIRECT:
1182                 walk_indirect(ib->b.fileaddr,
1183                               &buf, len-2, ui,
1184                               check_leaf, &leafinfo);
1185                 break;
1186         case IBLK_EXTENT:
1187                 walk_extent(ib->b.fileaddr,
1188                             &buf, len-2, ui,
1189                             check_leaf, &leafinfo);
1190                 break;
1191         default:
1192                 printk("Current is %d\n", current_layout);
1193                 LAFS_BUG(1, &ib->b); // FIXME should be IO error ??
1194         }
1195
1196         if (leafinfo.cnt == 0) {
1197                 /* no addresses need to be stored here. */
1198                 unmap_iblock(ib, ibuf);
1199                 return 0;
1200         }
1201         choice = leafinfo.choice ?: IBLK_EXTENT; /* default to Extents */
1202         if (offset && (leafinfo.nofit & (1<<choice))) {
1203                 /* addresses won't fit in this inode, so copy into new,
1204                  * and set inode up as an index block
1205                  */
1206                 unmap_iblock(ib, ibuf);
1207                 grow_index_tree(ib, new);
1208                 return 3;
1209         }
1210
1211         nbuf = map_iblock_2(new);
1212         layout.data = nbuf;
1213         layout.size = len;
1214         memset(nbuf, 0, fs->blocksize);
1215         set_bit(B_Valid, &new->b.flags);
1216
1217         buf = ibuf + offset + 2;
1218         switch (current_layout) {
1219         case IBLK_INDIRECT:
1220                 ok = walk_indirect(ib->b.fileaddr,
1221                                    &buf, len-2, ui,
1222                                    (choice == IBLK_EXTENT)
1223                                    ? add_extent : add_indirect,
1224                                    &layout);
1225                 break;
1226         case IBLK_EXTENT:
1227                 ok = walk_extent(ib->b.fileaddr,
1228                                  &buf, len-2, ui,
1229                                  (choice == IBLK_EXTENT)
1230                                  ? add_extent : add_indirect,
1231                                  &layout);
1232                 break;
1233         default: BUG();
1234         }
1235         dprintk("walk_leaf only got as far as %d\n", (int)layout.nextaddr);
1236         // print_index(ibuf+offset, ib->b.fileaddr, len);
1237         if (ok) {
1238                 /* it all fit perfectly.
1239                  * Copy from 'new' into 'ib' and zero the uninc list
1240                  */
1241                 //printk("Offset=%d len=%d\n", offset, len);
1242                 memcpy(ibuf+offset, nbuf, len);
1243                 unmap_iblock_2(new, nbuf);
1244                 unmap_iblock(ib, ibuf);
1245                 return 1;
1246         }
1247         next = layout.nextaddr;
1248         LAFS_BUG(offset, &ib->b);
1249
1250         /* It didn't fit, and this is a regular index block, so
1251          * we need to do a horizontal split.
1252          * 'new' already has the early addresses.
1253          * All addresses before 'next' in 'ui' and 'ib' need to be
1254          * cleared out.
1255          * new->fileaddr must be set to 'next'.
1256          * data buffers for new and ib need to be swapped
1257          */
1258
1259         /* There is nowhere that we can safely put any index info
1260          * that is still in 'ui' except into the new block with the
1261          * remains of the 'ib' addresses.
1262          * It had better fit.  And it will.
1263          * Here are the cases for current_layout and choice.
1264          * INDIRECT -> INDIRECT
1265          *  The start of ib hasn't changed, so all the addresses that
1266          *  were there are now in new.  So ib is empty and can fit everything
1267          *  in ui, as ui is <<< ib
1268          *
1269          * INDIRECT -> EXTENT
1270          *  More fitted as EXTENTS than as INDIRECT, so we must have included
1271          *  all remaining addresses from ib; only some ui addresses remain.
1272          * EXTENT -> EXTENT
1273          *  If extents have been added into new, there can have been at most
1274          *  two for each extent in ui (the new extent, and the extra piece from
1275          *  a split extent).  So worst case there are 16 extents left to encode,
1276          *  twice the number in ui.
1277          * EXTENT -> INDIRECT
1278          *  The INDIRECT must be storing more addresses than the EXTENT option,
1279          *  so there is even less remainder to be encoded.
1280          *
1281          * In each case, the remainder of ib must use less than half the space,
1282          * and it must be in EXTENT format if it is not empty.
1283          * So we can move them up to the top of the buffer, then merge with
1284          * anything remaining in ui.
1285          */
1286
1287         /* Shuffle remaining extents in ui down so they are easy to access */
1288         uinxt = 0;
1289         for (uinum = 0; uinum < ui->pending_cnt; uinum++) {
1290                 if (ui->pending_addr[uinum].cnt == 0)
1291                         continue;
1292                 if (ui->pending_addr[uinum].fileaddr
1293                     + ui->pending_addr[uinum].cnt < next)
1294                         continue;
1295                 if (ui->pending_addr[uinum].fileaddr < next) {
1296                         int cnt = ui->pending_addr[uinum].cnt
1297                                 - (next - ui->pending_addr[uinum].fileaddr);
1298                         ui->pending_addr[uinxt].physaddr =
1299                                 ui->pending_addr[uinum].physaddr
1300                                 + (next - ui->pending_addr[uinum].fileaddr);
1301                         ui->pending_addr[uinxt].fileaddr = next;
1302                         ui->pending_addr[uinxt].cnt = cnt;
1303                 } else {
1304                         ui->pending_addr[uinxt].fileaddr =
1305                                 ui->pending_addr[uinum].fileaddr;
1306                         ui->pending_addr[uinxt].physaddr =
1307                                 ui->pending_addr[uinum].physaddr;
1308                         ui->pending_addr[uinxt].cnt =
1309                                 ui->pending_addr[uinum].cnt;
1310                 }
1311                 uinxt++;
1312         }
1313         ui->pending_cnt = uinxt;
1314
1315         switch (current_layout) {
1316         case IBLK_INDIRECT:
1317                 /* buf must now be effectively empty, and there must
1318                  * be at least one extent still in ui.
1319                  * So zero the buf and encode those extents as IBLK_EXTENT
1320                  */
1321                 LAFS_BUG(next < ib->b.fileaddr + (len-2)/6, &ib->b);
1322                 LAFS_BUG(ui->pending_cnt == 0, &ib->b);
1323                 memset(ibuf, 0, len);
1324                 sbuf = NULL;
1325                 slen = 0;
1326                 break;
1327         case IBLK_EXTENT:
1328                 /* The combination of the remaining extents in buf, and
1329                  * those in ui will not fill buf.
1330                  * So shift the buf extents up to the top of the buffer,
1331                  * then merge the remaining ui extents in.
1332                  * The first extent may be partially processed already, so
1333                  * we might need to adjust it.
1334                  */
1335                 /* Find end of the extent list.  Could use a binary search,
1336                  * but not now.
1337                  */
1338                 b2 = buf;
1339                 while (b2 + 12 <= ibuf + len) {
1340                         ephys = decode48(b2);
1341                         elen = decode16(b2);
1342                         eaddr = decode32(b2);
1343                         if (elen == 0) {
1344                                 b2 -= 12;
1345                                 break;
1346                         }
1347                         if (eaddr < next) {
1348                                 int handled = next - eaddr;
1349                                 BUG_ON(eaddr + elen <= next);
1350                                 b2 -= 12;
1351                                 BUG_ON(b2 != buf);
1352                                 encode48(b2, ephys + handled);
1353                                 encode16(b2, elen - handled);
1354                                 encode32(b2, next);
1355                         }
1356                 }
1357                 /* Data we want extends from buf up to b2
1358                  * Move it to end of ibuf
1359                  */
1360                 slen = b2 - buf;
1361                 sbuf = ibuf + len - slen;
1362                 memmove(sbuf, buf, slen);
1363                 break;
1364         default:
1365                 LAFS_BUG(1, &ib->b);
1366         }
1367         layout.data = ibuf;
1368         layout.size = len;
1369         ok = walk_extent(next, &sbuf, slen, ui, add_extent, &layout);
1370         LAFS_BUG(!ok, &ib->b);
1371         if (slen && layout.data > sbuf) {
1372                 printk("slen=%d ld-sb=%d layout.data=%p sbuf=%p "
1373                        "buf=%p ibuf=%p len=%d\n",
1374                        slen, (int)(layout.data-sbuf), layout.data, sbuf,
1375                        buf, ibuf, len);
1376         }
1377         LAFS_BUG(slen && layout.data > sbuf, &ib->b);
1378         memset(layout.data, 0, layout.size);
1379
1380         new->b.fileaddr = next;
1381         new->data = ibuf;
1382         ib->data = nbuf;
1383         unmap_iblock_2(new, nbuf);
1384         unmap_iblock(ib, ibuf);
1385
1386         /* new needs to be a sibling of ib, and children need to be shared out,
1387          *   distributing pincnt appropriately.
1388          * uninc_next needs to be shared too
1389          */
1390         share_children(ib, new);
1391
1392         spin_lock(&ib->b.inode->i_data.private_lock);
1393         if (ib->uninc_next)
1394                 share_list(&ib->uninc_next, &new->uninc_next, next);
1395         new->uninc_table.pending_cnt = 0;
1396         new->uninc_table.credits = 0;
1397         share_uninc(&ib->uninc_table, &new->uninc_table, next);
1398         spin_unlock(&ib->b.inode->i_data.private_lock);
1399
1400         new->depth = ib->depth;
1401         if (ib->b.siblings.next != &ib->b.parent->children &&
1402             test_bit(B_PrimaryRef, &list_entry(ib->b.siblings.next,
1403                                                struct block, siblings)->flags))
1404                 /* Inserting into a PrimaryRef chain */
1405                 getiref(new, MKREF(primary));
1406         else
1407                 getiref(ib, MKREF(primary));
1408         set_bit(B_PrimaryRef, &new->b.flags);
1409         list_add(&new->b.siblings, &ib->b.siblings);
1410         new->b.parent = ib->b.parent;
1411         (void)getiref(new->b.parent, MKREF(child));
1412         new->b.inode = ib->b.inode;
1413         LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
1414         set_phase(&new->b, test_bit(B_Phase1, &ib->b.flags));
1415         return 2;
1416 }
1417
1418 static int do_incorporate_internal(struct fs *fs, struct indexblock *ib,
1419                                    struct block *uninc, int *credits,
1420                                    struct indexblock *new)
1421 {
1422         /* Incorporate all the changes in 'uninc' into ib,
1423          * with any new address appearing in new.
1424          * Return value is one of:
1425          * 0: uninc removed everything, block is now empty.
1426          * 1: everything fits in 'ib'.  New can be discarded.  uninc_table
1427          *    is empty.
1428          * 2: the addresses are in ib, new, and possibly new->uninc
1429          *    'new' has ->fileaddr set.
1430          * 3: ib is InoIdx and the addresses didn't all fit, so everything
1431          *    was copied to new and ib is now an IBLK_INDEX block pointing
1432          *    at new.  'new' still needs incorporation.
1433          *
1434          */
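        /* Worked example with assumed numbers (illustration only): if 'ib'
         * starts at file address 0 and the merged entries overflow at
         * address 120, the result is case 2: the entries below 120 stay
         * in 'ib', 'new' gets fileaddr 120 together with the entries from
         * there on, and any still-unincorporated blocks at or beyond 120
         * are left chained on new->uninc.
         */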
1435         char *ibuf, *nbuf;
1436         char *buf;
1437         int offset;
1438         int len;
1439         int current_layout;
1440         struct layoutinfo layout;
1441         u32 next = 0;
1442         int ok;
1443         int cr;
1444
1445         ibuf = map_iblock(ib);
1446         if (test_bit(B_InoIdx, &ib->b.flags))
1447                 offset = LAFSI(ib->b.inode)->metadata_size;
1448         else
1449                 offset = 0;
1450         len = fs->blocksize - offset;
1451
1452         current_layout = le16_to_cpu(*(u16 *)(ibuf+offset));
1453         buf = ibuf + offset + 2;
1454         LAFS_BUG(current_layout != IBLK_INDEX,
1455                  &ib->b);
1456
1457         nbuf = map_iblock_2(new);
1458         layout.data = nbuf;
1459         layout.size = len;
1460         memset(nbuf, 0, fs->blocksize);
1461
1462         ok = walk_index(ib->b.fileaddr,
1463                         &buf, len-2, uninc,
1464                         add_index,
1465                         &layout);
1466
1467         dprintk("walk_index only got as far as %d\n", (int)layout.nextaddr);
1468
1469         if (ok) {
1470                 /* it all fit perfectly.
1471                  * Copy from 'new' into 'ib' and free all uninc blocks
1472                  */
1473                 //printk("Offset=%d len=%d\n", offset, len);
1474                 memcpy(ibuf+offset, nbuf, len);
1475                 unmap_iblock_2(new, nbuf);
1476                 unmap_iblock(ib, ibuf);
1477                 cr = 0;
1478                 temp_credits = 0;
1479                 while (uninc) {
1480                         struct block *b = uninc;
1481                         uninc = uninc->chain;
1482                         cr += incorp_index(b);
1483                 }
1484                 temp_credits = 0;
1485                 *credits += cr;
1486                 return 1;
1487         }
1488         if (offset) {
1489                 unmap_iblock(ib, ibuf);
1490                 grow_index_tree(ib, new);
1491                 return 3;
1492         }
1493         next = layout.nextaddr;
1494         /* It didn't fit, and this is a regular index block, so
1495          * we need to do a horizontal split.
1496          * 'new' already has the early addresses.
1497          * All addresses before 'next' in 'ib' need to be
1498          * cleared out.
1499          * new->fileaddr must be set to 'next'.
1500          * data buffers for new and ib need to be swapped
1501          */
1502
1503         /* shuffle ibuf down.
1504          * 'buf' is the starting point.
1505          */
1506         memmove(ibuf+2, buf, ibuf + len - buf);
1507         memset(ibuf+2 + len - (buf - ibuf), 0, buf - (ibuf+2));
1508
1509         new->b.fileaddr = next;
1510         new->data = ibuf;
1511         ib->data = nbuf;
1512         unmap_iblock_2(new, nbuf);
1513         unmap_iblock(ib, ibuf);
1514         set_bit(B_Valid, &new->b.flags);
1515
1516         /* new needs to be a sibling of ib, and children need to be shared out,
1517          *   distributing pincnt appropriately.
1518          * All of the remaining uninc blocks go to 'new', while
1519          * ib->uninc and ib->uninc_next need to be shared out.
1520          * We need to be careful to preserve the relative order of children
1521          * as they might not all be incorporated yet.
1522          */
1523         share_children(ib, new);
1524
1525         cr = 0;
1526         temp_credits = 0;
1527         while (uninc && uninc->fileaddr < next) {
1528                 struct block *b = uninc;
1529                 uninc = uninc->chain;
1530                 cr += incorp_index(b);
1531         }
1532         temp_credits = 0;
1533         *credits += cr;
1534         new->uninc = uninc;
1535         spin_lock(&ib->b.inode->i_data.private_lock);
1536         if (ib->uninc_next)
1537                 share_list(&ib->uninc_next, &new->uninc_next, next);
1538         if (ib->uninc)
1539                 share_list(&ib->uninc, &new->uninc, next);
1540         spin_unlock(&ib->b.inode->i_data.private_lock);
1541
1542         new->depth = ib->depth;
1543         if (ib->b.siblings.next != &ib->b.parent->children &&
1544             test_bit(B_PrimaryRef, &list_entry(ib->b.siblings.next,
1545                                                struct block, siblings)->flags))
1546                 /* Inserting into a PrimaryRef chain */
1547                 getiref(new, MKREF(primary));
1548         else
1549                 getiref(ib, MKREF(primary));
1550         set_bit(B_PrimaryRef, &new->b.flags);
1551         list_add(&new->b.siblings, &ib->b.siblings);
1552         new->b.parent = ib->b.parent;
1553         (void)getiref(new->b.parent, MKREF(child));
1554         new->b.inode = ib->b.inode;
1555         LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
1556         set_phase(&new->b, test_bit(B_Phase1, &ib->b.flags));
1557
1558         if (new->uninc && new->uninc->fileaddr == next) {
1559                 struct block *b;
1560                 /* This block really needs to be incorporated, or
1561                  * we might not be able to find it - it might be
1562                  * a primary for other blocks too.
1563                  * So do a minimal incorporation of this block.
1564                  * That must succeed as it will be an addition to an
1565                  * empty block, or a replacement.
1566                  */
1567                 b = new->uninc;
1568                 new->uninc = b->chain;
1569                 b->chain = NULL;
1570                 nbuf = map_iblock(new);
1571                 cr = incorporate_index(b, nbuf, len);
1572                 unmap_iblock(new, nbuf);
1573                 LAFS_BUG(cr < 0, &ib->b);
1574                 *credits += cr;
1575         }
1576
1577         return 2;
1578 }
1579
1580 static void filter_empties(struct block **uninc)
1581 {
1582         /* If any block has phys==0 and there is another block
1583          * with the same address, incorp_index the phys==0 block
1584          */
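        /* Worked example with assumed addresses (illustration only):
         * given the sorted chain {addr 10, phys 0} -> {addr 10, phys 4096}
         * -> {addr 12, phys 8192}, the phys==0 entry at address 10 is
         * taken off the chain and passed to incorp_index by itself,
         * leaving {addr 10, phys 4096} -> {addr 12, phys 8192}.
         */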
1585         struct block *b = *uninc;
1586         while (b && b->chain) {
1587                 if (b->fileaddr == b->chain->fileaddr) {
1588                         struct block *t;
1589                         if (b->physaddr == 0) {
1590                                 t = b;
1591                                 b = *uninc = b->chain;
1592                         } else if (b->chain->physaddr == 0) {
1593                                 t = b->chain;
1594                                 b->chain = t->chain;
1595                         } else
1596                                 LAFS_BUG(1, b);
1597                         t->chain = NULL;
1598                         incorp_index(t);
1599                 } else {
1600                         uninc = &b->chain;
1601                         b = *uninc;
1602                 }
1603         }
1604 }
1605
1606 /* Incorporate all the addresses in uninc_table into this
1607  * indexblock.  Each address carries a credit to allow for
1608  * indexblock splitting etc.
1609  * The block is already Dirty if it should go in the main
1610  * cluster, or Realloc if it should go in a cleaning cluster.
1611  */
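/*
 * Rough flow sketch, paraphrasing the code below (illustrative only,
 * not normative documentation):
 *
 *      if the block is a leaf (depth <= 1)
 *              copy uninc_table and try incorporate_indirect() or
 *              incorporate_extent() in place;
 *      else
 *              take, sort and filter ->uninc and try incorporate_index()
 *              in place;
 *      if the in-place fast paths do not suffice
 *              allocate 'new' and call do_incorporate_leaf() or
 *              do_incorporate_internal(), which may split the block
 *              (case 2) or grow the tree and retry (case 3);
 *      finally, return any surplus credits and possibly mark the block
 *      EmptyIndex.
 */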
1612 void lafs_incorporate(struct fs *fs, struct indexblock *ib)
1613 {
1614         int offset;     /* start of block where indexing is */
1615         struct indexblock *new = NULL;
1616         struct block *uninc = NULL;
1617         int rv = 0;
1618         char *buf;
1619         struct uninc uit;
1620         int blocksize = fs->blocksize;
1621
1622         LAFS_BUG(!test_bit(B_IOLock, &ib->b.flags), &ib->b);
1623
1624         LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1625                  !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1626
1627         LAFS_BUG(!test_bit(B_Valid, &ib->b.flags) && ib->depth > 0, &ib->b);
1628
1629         if (ib->depth <= 1) {
1630                 /* take a copy of the uninc_table so we can work on it
1631                  * while further changes are being made
1632                  */
1633                 char *b;
1634                 u16 type;
1635                 u32 start;
1636
1637                 if (ib->uninc) {
1638                         struct block *x = ib->uninc;
1639                         while (x) {
1640                                 printk("x=%s\n", strblk(x));
1641                                 x = x->chain;
1642                         }
1643                 }
1644                 LAFS_BUG(ib->uninc, &ib->b);
1645
1646                 spin_lock(&ib->b.inode->i_data.private_lock);
1647                 memcpy(&uit, &ib->uninc_table, sizeof(uit));
1648                 ib->uninc_table.pending_cnt = 0;
1649                 ib->uninc_table.credits = 0;
1650                 spin_unlock(&ib->b.inode->i_data.private_lock);
1651
1652                 if (test_bit(B_InoIdx, &ib->b.flags)) {
1653                         offset = LAFSI(ib->b.inode)->metadata_size;
1654                         if (LAFSI(ib->b.inode)->depth == 0) {
1655                                 /* data has already been copied
1656                                  * out.. I hope - FIXME */
1657                                 LAFSI(ib->b.inode)->depth = 1;
1658                                 ib->depth = 1;
1659                                 lafs_clear_index(ib);
1660                         }
1661                 } else
1662                         offset = 0;
1663
1664                 buf = map_iblock(ib) + offset;
1665                 b = buf;
1666                 type = decode16(b);
1667                 start = decode32(b);
1668
1669                 if (uit.pending_cnt == 0) {
1670                         unmap_iblock(ib, buf-offset);
1671                         goto out;
1672                 }
1673                 /* Check it really is sorted */
1674                 {
1675                         int i;
1676                         for (i=1; i<uit.pending_cnt; i++)
1677                                 if (uit.pending_addr[i].fileaddr <
1678                                     uit.pending_addr[i-1].fileaddr +
1679                                     uit.pending_addr[i-1].cnt)
1680                                         BUG();
1681                 }
1682
1683                 if (!test_bit(B_Dirty, &ib->b.flags) &&
1684                    !test_bit(B_Realloc, &ib->b.flags))
1685                         printk("bad %s\n", strblk(&ib->b));
1686                 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1687                          !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1688
1689                 /* OK, we genuinely have work to do. */
1690                 /* find size and type of current block */
1691
1692                 /* Maybe a direct update of an indirect block */
1693                 if (type == IBLK_INDIRECT &&
1694                     incorporate_indirect(&uit, buf+2,
1695                                          ib->b.fileaddr,
1696                                          blocksize-(offset+2))) {
1697                         unmap_iblock(ib, buf-offset);
1698                         goto out;
1699                 }
1700
1701                 /* Maybe a fairly direct update of an extent block */
1702                 if (type == IBLK_EXTENT &&
1703                     incorporate_extent(&uit, buf+2,
1704                                        blocksize-(offset+2))) {
1705                         unmap_iblock(ib, buf-offset);
1706                         goto out;
1707                 }
1708
1709                 dprintk("Index contains:\n");
1710                 if (lafs_trace)
1711                         print_index(buf, ib->b.fileaddr, blocksize - offset);
1712                 dprintk("uninc contains:\n");
1713                 if (lafs_trace)
1714                         lafs_print_uninc(&uit);
1715
1716                 unmap_iblock(ib, buf-offset);
1717
1718         } else {
1719                 int cred;
1720                 uit.credits = 0;
1721
1722                 LAFS_BUG(ib->uninc_table.pending_cnt, &ib->b);
1723                 spin_lock(&ib->b.inode->i_data.private_lock);
1724                 uninc = ib->uninc;
1725                 ib->uninc = NULL;
1726                 spin_unlock(&ib->b.inode->i_data.private_lock);
1727
1728                 if (uninc == NULL)
1729                         goto out;
1730
1731                 sort_blocks(&uninc);
1732                 filter_empties(&uninc);
1733
1734                 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1735                          !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1736
1737                 /* OK, we genuinely have work to do. */
1738                 /* find size and type of current block */
1739                 buf = map_iblock(ib);
1740
1741                 if (test_bit(B_InoIdx, &ib->b.flags)) {
1742                         offset = LAFSI(ib->b.inode)->metadata_size;
1743                         buf += offset;
1744                 } else
1745                         offset = 0;
1746
1747                 dprintk("Index contains:\n");
1748                 if (lafs_trace) {
1749                         struct block *b;
1750
1751                         print_index(buf, ib->b.fileaddr, blocksize - offset);
1752                         printk("uninc list:\n");
1753                         for (b = uninc; b ; b=b->chain)
1754                                 printk("  %lu: %llu\n",
1755                                        (unsigned long) b->fileaddr,
1756                                        (unsigned long long) b->physaddr);
1757                 }
1758
1759                 /* internal index block.  Might be able to merge in-place */
1760                 if (*(u16 *)(buf) == cpu_to_le16(IBLK_INDEX) &&
1761                     (cred = incorporate_index(uninc, buf+2,
1762                                               blocksize-(offset+2))) >= 0) {
1763                         unmap_iblock(ib, buf-offset);
1764                         uit.credits = cred;
1765                         goto out;
1766                 }
1767                 unmap_iblock(ib, buf-offset);
1768
1769         }
1770         /* Ok, those were the easy options. Now we need to allocate a
1771          * new index block.
1772          */
1773 retry:
1774         new = lafs_iblock_alloc(fs, GFP_NOFS, 1, MKREF(inc));
1775         /* FIXME need to preallocate something for a fall-back?? */
1776
1777         /* We lock the block despite the fact that it isn't
1778          * shared, so that map_iblock won't complain */
1779         lafs_iolock_block(&new->b);
1780
1781         if (ib->depth < 1)
1782                 printk("small depth %s\n", strblk(&ib->b));
1783         LAFS_BUG(ib->depth < 1, &ib->b);
1784         if (ib->depth == 1)
1785                 rv = do_incorporate_leaf(fs, ib, &uit, new);
1786         else
1787                 rv = do_incorporate_internal(fs, ib, uninc, &uit.credits, new);
1788
1789         switch (rv) {
1790         case 0:
1791                 /* There is nothing in this block any more.
1792                  * If it is an inode, clear it, else punch a hole
1793                  */
1794                 dprintk("incorp to empty off=%d %s\n",
1795                         (int)offset, strblk(&ib->b));
1796                 lafs_iounlock_block(&new->b);
1797                 lafs_iblock_free(new);
1798                 lafs_clear_index(ib);
1799                 break;
1800
1801         case 1: /* everything was incorporated - hurray */
1802                 lafs_iounlock_block(&new->b);
1803                 lafs_iblock_free(new);
1804                 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1805                          !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1806                 /* Don't need to dirty, it is already dirty */
1807                 break;
1808
1809         case 2: /* Simple split */
1810                 /* The addresses are now in 'ib', 'new' and possibly
1811                  * new->uninc_table.  'new' has been linked in to the
1812                  * parent.
1813                  */
1814                 uit.credits--;
1815                 set_bit(B_Credit, &new->b.flags);
1816                 if (uit.credits > 0) {
1817                         set_bit(B_ICredit, &new->b.flags);
1818                         uit.credits--;
1819                 }
1820
1821                 lafs_dirty_iblock(new, !test_bit(B_Dirty, &ib->b.flags));
1822                 lafs_iounlock_block(&new->b);
1823                 temp_credits = uit.credits;
1824                 putiref(new, MKREF(inc));
1825                 break;
1826         case 3: /* Need to grow */
1827                 /* new needs a B_Credit and a B_ICredit.
1828                  */
1829
1830                 uit.credits--;
1831                 set_bit(B_Credit, &new->b.flags);
1832                 if (uit.credits > 0) {
1833                         set_bit(B_ICredit, &new->b.flags);
1834                         uit.credits--;
1835                 }
1836
1837                 lafs_dirty_iblock(new, !test_bit(B_Dirty, &ib->b.flags));
1838                 dprintk("Just Grew %s\n", strblk(&new->b));
1839                 dprintk("     from %s\n", strblk(&ib->b));
1840                 lafs_iounlock_block(&new->b);
1841                 temp_credits = uit.credits;
1842                 putiref(new, MKREF(inc));
1843
1844                 goto retry;
1845         }
1846
1847 out:
1848         if (uit.credits > 0 && !test_and_set_bit(B_UnincCredit, &ib->b.flags))
1849                 uit.credits--;
1850         if (uit.credits > 0 && !test_and_set_bit(B_ICredit, &ib->b.flags))
1851                 uit.credits--;
1852         if (uit.credits > 0 && !test_and_set_bit(B_NICredit, &ib->b.flags))
1853                 uit.credits--;
1854         if (uit.credits < 0) {
1855                 printk("Credits = %d, rv=%d\n", uit.credits, rv);
1856                 printk("ib = %s\n", strblk(&ib->b));
1857         }
1858         temp_credits = 0;
1859         lafs_space_return(fs, uit.credits);
1860
1861         /* If this index block is now empty and has no
1862          * children which are not EmptyIndex, we flag it
1863          * as EmptyIndex so index lookups know to avoid it.
1864          * Dirty children also delay empty-handling.  This
1865          * is important at the InoIdx level as we cannot drop
1866          * the depth to 1 while there are still children that
1867          * might want to be incorporated.
1868          */
1869         if (ib->depth > 1 && ! lafs_index_empty(ib))
1870                 goto out2;
1871         if (ib->depth > 1) {
1872                 int non_empty = 0;
1873                 struct block *b;
1874                 spin_lock(&ib->b.inode->i_data.private_lock);
1875                 list_for_each_entry(b, &ib->children, siblings)
1876                         if (!test_bit(B_EmptyIndex, &b->flags) ||
1877                             test_bit(B_Dirty, &b->flags) ||
1878                             test_bit(B_Realloc, &b->flags)) {
1879                                 non_empty = 1;
1880                                 break;
1881                         }
1882                 spin_unlock(&ib->b.inode->i_data.private_lock);
1883                 if (non_empty)
1884                         goto out2;
1885         }
1886         if (test_bit(B_InoIdx, &ib->b.flags)) {
1887                 /* Empty InoIdx blocks are allowed.  However depth must
1888                  * be 1.  This is where we suddenly collapse a now-empty
1889                  * indexing tree.
1890                  */
1891                 if (ib->depth > 1) {
1892                         ib->depth = 1;
1893                         LAFSI(ib->b.inode)->depth = 1;
1894                         lafs_clear_index(ib);
1895                 }
1896                 /* Don't need to dirty the inode - the fact that we just
1897                  * did incorporation should ensure it is already dirty
1898                  */
1899                 goto out2;
1900         }
1901         if (ib->depth == 1 &&
1902             (lafs_leaf_next(ib, 0) != 0xFFFFFFFF ||
1903              !list_empty(&ib->children)))
1904                 goto out2;
1905
1906         /* Block is really really empty */
1907         set_bit(B_EmptyIndex, &ib->b.flags);
1908 out2:
1909         return;
1910 }
1911
1912 /***************************************************************
1913  * Space pre-allocation
1914  * We need to make sure that the block and all parents
1915  * have a full complement of space credits.
1916  * If this cannot be achieved, we can fail with either
1917  * -ENOSPC if we were allocating more space, or -EAGAIN.
1918  */
1919
1920 int __must_check lafs_prealloc(struct block *blk, int why)
1921 {
1922         struct fs *fs = fs_from_inode(blk->inode);
1923         struct block *b;
1924         int need;
1925         int credits = 0;
1926
1927         LAFS_BUG(why != CleanSpace && why != AccountSpace
1928                  && !test_phase_locked(fs)
1929                  && !fs->checkpointing, blk);
1930
1931 retry:
1932         need = 0;
1933         b = blk;
1934
1935         while (b) {
1936                 if (!test_bit(B_Dirty, &b->flags)) {
1937                         if (credits <= 0)
1938                                 need += !test_bit(B_Credit, &b->flags);
1939                         else if (!test_and_set_bit(B_Credit, &b->flags))
1940                                 credits--;
1941                 }
1942                 if (!test_bit(B_UnincCredit, &b->flags)) {
1943                         if (credits <= 0)
1944                                 need += !test_bit(B_ICredit, &b->flags);
1945                         else if (!test_and_set_bit(B_ICredit, &b->flags))
1946                                 credits--;
1947                 }
1948                 /* We need N*Credits if this block might need to be
1949                  * phase-flipped and remain pinned in the next
1950                  * phase.  i.e. index blocks and accounting blocks.
1951                  */
1952                 if (test_bit(B_Index, &b->flags) ||
1953                     LAFSI(b->inode)->type == TypeQuota ||
1954                     LAFSI(b->inode)->type == TypeSegmentMap) {
1955                         if (credits <= 0)
1956                                 need += !test_bit(B_NCredit, &b->flags);
1957                         else if (!test_and_set_bit(B_NCredit, &b->flags))
1958                                 credits--;
1959                         if (credits <= 0)
1960                                 need += !test_bit(B_NICredit, &b->flags);
1961                         else if (!test_and_set_bit(B_NICredit, &b->flags))
1962                                 credits--;
1963                 }
1964                 if (test_bit(B_InoIdx, &b->flags))
1965                         b = &LAFSI(b->inode)->dblock->b;
1966                 else
1967                         b = &b->parent->b;
1968         }
1969         dprintk("need=%d credits=%d for %s\n", need, credits, strblk(blk));
1970         if (need == 0) {
1971                 /* all needed credits allocated */
1972                 lafs_space_return(fs, credits);
1973                 return 0;
1974         }
1975
1976         if (lafs_space_alloc(fs, need, why)) {
1977                 dprintk("Got them for %s\n", strblk(blk));
1978                 /* We got the credits we asked for */
1979                 credits = need;
1980                 goto retry;
1981         }
1982         lafs_space_return(fs, credits);
1983
1984         return -ENOSPC;
1985 }
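/*
 * Usage sketch with assumed names (illustration only, not a call site
 * from this file): a caller that is about to dirty a pinned block 'b'
 * would do something like
 *
 *      int err = lafs_prealloc(b, AccountSpace);
 *      if (err)
 *              return err;
 *
 * where err is -ENOSPC when the needed credits cannot be obtained, and
 * only on success does the caller go on to dirty 'b', so that 'b' and
 * every ancestor already hold the credits later incorporation may use.
 */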