]> git.neil.brown.name Git - mdadm.git/blob - restripe.c
Grow: Handle failure to load superblock in Grow_addbitmap()
[mdadm.git] / restripe.c
1 /*
2  * mdadm - manage Linux "md" devices aka RAID arrays.
3  *
4  * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
5  *
6  *
7  *    This program is free software; you can redistribute it and/or modify
8  *    it under the terms of the GNU General Public License as published by
9  *    the Free Software Foundation; either version 2 of the License, or
10  *    (at your option) any later version.
11  *
12  *    This program is distributed in the hope that it will be useful,
13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *    GNU General Public License for more details.
16  *
17  *    You should have received a copy of the GNU General Public License
18  *    along with this program; if not, write to the Free Software
19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  *    Author: Neil Brown
22  *    Email: <neilb@suse.de>
23  */
24
25 #include "mdadm.h"
26 #include <stdint.h>
27
/* To restripe, we read from old geometry to a buffer, and
 * write from buffer to new geometry.
30  * When reading, we might have missing devices and so could need
31  * to reconstruct.
32  * When writing, we need to create correct parity and Q.
33  *
34  */
35
36 int geo_map(int block, unsigned long long stripe, int raid_disks,
37                    int level, int layout)
38 {
39         /* On the given stripe, find which disk in the array will have
40          * block numbered 'block'.
41          * '-1' means the parity block.
42          * '-2' means the Q syndrome.
43          */
44         int pd;
45
46         /* layout is not relevant for raid0 and raid4 */
47         if ((level == 0) ||
48             (level == 4))
49                 layout = 0;
50
51         switch(level*100 + layout) {
52         case 000:
53         case 400:
54         case 500 + ALGORITHM_PARITY_N:
55                 /* raid 4 isn't messed around by parity blocks */
56                 if (block == -1)
57                         return raid_disks-1; /* parity block */
58                 return block;
59         case 500 + ALGORITHM_LEFT_ASYMMETRIC:
60                 pd = (raid_disks-1) - stripe % raid_disks;
61                 if (block == -1) return pd;
62                 if (block >= pd)
63                         block++;
64                 return block;
65
66         case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
67                 pd = stripe % raid_disks;
68                 if (block == -1) return pd;
69                 if (block >= pd)
70                         block++;
71                 return block;
72
73         case 500 + ALGORITHM_LEFT_SYMMETRIC:
74                 pd = (raid_disks - 1) - stripe % raid_disks;
75                 if (block == -1) return pd;
76                 return (pd + 1 + block) % raid_disks;
77
78         case 500 + ALGORITHM_RIGHT_SYMMETRIC:
79                 pd = stripe % raid_disks;
80                 if (block == -1) return pd;
81                 return (pd + 1 + block) % raid_disks;
82
83         case 500 + ALGORITHM_PARITY_0:
84                 return block + 1;
85
86         case 600 + ALGORITHM_PARITY_N_6:
87                 if (block == -2)
88                         return raid_disks - 1;
89                 if (block == -1)
90                         return raid_disks - 2; /* parity block */
91                 return block;
92         case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
93                 if (block == -2)
94                         return raid_disks - 1;
95                 raid_disks--;
96                 pd = (raid_disks-1) - stripe % raid_disks;
97                 if (block == -1) return pd;
98                 if (block >= pd)
99                         block++;
100                 return block;
101
102         case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
103                 if (block == -2)
104                         return raid_disks - 1;
105                 raid_disks--;
106                 pd = stripe % raid_disks;
107                 if (block == -1) return pd;
108                 if (block >= pd)
109                         block++;
110                 return block;
111
112         case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
113                 if (block == -2)
114                         return raid_disks - 1;
115                 raid_disks--;
116                 pd = (raid_disks - 1) - stripe % raid_disks;
117                 if (block == -1) return pd;
118                 return (pd + 1 + block) % raid_disks;
119
120         case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
121                 if (block == -2)
122                         return raid_disks - 1;
123                 raid_disks--;
124                 pd = stripe % raid_disks;
125                 if (block == -1) return pd;
126                 return (pd + 1 + block) % raid_disks;
127
128         case 600 + ALGORITHM_PARITY_0_6:
129                 if (block == -2)
130                         return raid_disks - 1;
131                 return block + 1;
132
133         case 600 + ALGORITHM_PARITY_0:
134                 if (block == -1)
135                         return 0;
136                 if (block == -2)
137                         return 1;
138                 return block + 2;
139
140         case 600 + ALGORITHM_LEFT_ASYMMETRIC:
141                 pd = raid_disks - 1 - (stripe % raid_disks);
142                 if (block == -1) return pd;
143                 if (block == -2) return (pd+1) % raid_disks;
144                 if (pd == raid_disks - 1)
145                         return block+1;
146                 if (block >= pd)
147                         return block+2;
148                 return block;
149
150         case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
151                 /* Different order for calculating Q, otherwize same as ... */
152         case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
153                 pd = stripe % raid_disks;
154                 if (block == -1) return pd;
155                 if (block == -2) return (pd+1) % raid_disks;
156                 if (pd == raid_disks - 1)
157                         return block+1;
158                 if (block >= pd)
159                         return block+2;
160                 return block;
161
162         case 600 + ALGORITHM_LEFT_SYMMETRIC:
163                 pd = raid_disks - 1 - (stripe % raid_disks);
164                 if (block == -1) return pd;
165                 if (block == -2) return (pd+1) % raid_disks;
166                 return (pd + 2 + block) % raid_disks;
167
168         case 600 + ALGORITHM_RIGHT_SYMMETRIC:
169                 pd = stripe % raid_disks;
170                 if (block == -1) return pd;
171                 if (block == -2) return (pd+1) % raid_disks;
172                 return (pd + 2 + block) % raid_disks;
173
174         case 600 + ALGORITHM_ROTATING_N_RESTART:
175                 /* Same a left_asymmetric, by first stripe is
176                  * D D D P Q  rather than
177                  * Q D D D P
178                  */
179                 pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
180                 if (block == -1) return pd;
181                 if (block == -2) return (pd+1) % raid_disks;
182                 if (pd == raid_disks - 1)
183                         return block+1;
184                 if (block >= pd)
185                         return block+2;
186                 return block;
187
188         case 600 + ALGORITHM_ROTATING_N_CONTINUE:
189                 /* Same as left_symmetric but Q is before P */
190                 pd = raid_disks - 1 - (stripe % raid_disks);
191                 if (block == -1) return pd;
192                 if (block == -2) return (pd+raid_disks-1) % raid_disks;
193                 return (pd + 1 + block) % raid_disks;
194         }
195         return -1;
196 }
197
198 int is_ddf(int layout)
199 {
200         switch (layout)
201         {
202         default:
203                 return 0;
204         case ALGORITHM_ROTATING_N_CONTINUE:
205         case ALGORITHM_ROTATING_N_RESTART:
206         case ALGORITHM_ROTATING_ZERO_RESTART:
207                 return 1;
208         }
209 }
210
211 void xor_blocks(char *target, char **sources, int disks, int size)
212 {
213         int i, j;
214         /* Amazingly inefficient... */
215         for (i=0; i<size; i++) {
216                 char c = 0;
217                 for (j=0 ; j<disks; j++)
218                         c ^= sources[j][i];
219                 target[i] = c;
220         }
221 }
222
223 void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
224 {
225         int d, z;
226         uint8_t wq0, wp0, wd0, w10, w20;
227         for ( d = 0; d < size; d++) {
228                 wq0 = wp0 = sources[disks-1][d];
229                 for ( z = disks-2 ; z >= 0 ; z-- ) {
230                         wd0 = sources[z][d];
231                         wp0 ^= wd0;
232                         w20 = (wq0&0x80) ? 0xff : 0x00;
233                         w10 = (wq0 << 1) & 0xff;
234                         w20 &= 0x1d;
235                         w10 ^= w20;
236                         wq0 = w10 ^ wd0;
237                 }
238                 p[d] = wp0;
239                 q[d] = wq0;
240         }
241 }
242
243 /*
244  * The following was taken from linux/drivers/md/mktables.c, and modified
245  * to create in-memory tables rather than C code
246  */
247 static uint8_t gfmul(uint8_t a, uint8_t b)
248 {
249         uint8_t v = 0;
250
251         while (b) {
252                 if (b & 1)
253                         v ^= a;
254                 a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
255                 b >>= 1;
256         }
257
258         return v;
259 }
260
261 static uint8_t gfpow(uint8_t a, int b)
262 {
263         uint8_t v = 1;
264
265         b %= 255;
266         if (b < 0)
267                 b += 255;
268
269         while (b) {
270                 if (b & 1)
271                         v = gfmul(v, a);
272                 a = gfmul(a, a);
273                 b >>= 1;
274         }
275
276         return v;
277 }
278
279 int tables_ready = 0;
280 uint8_t raid6_gfmul[256][256];
281 uint8_t raid6_gfexp[256];
282 uint8_t raid6_gfinv[256];
283 uint8_t raid6_gfexi[256];
284 uint8_t raid6_gflog[256];
285 uint8_t raid6_gfilog[256];
286 void make_tables(void)
287 {
288         int i, j;
289         uint8_t v;
290         uint32_t b, log;
291
292         /* Compute multiplication table */
293         for (i = 0; i < 256; i++)
294                 for (j = 0; j < 256; j++)
295                                 raid6_gfmul[i][j] = gfmul(i, j);
296
297         /* Compute power-of-2 table (exponent) */
298         v = 1;
299         for (i = 0; i < 256; i++) {
300                 raid6_gfexp[i] = v;
301                 v = gfmul(v, 2);
302                 if (v == 1)
303                         v = 0;  /* For entry 255, not a real entry */
304         }
305
306         /* Compute inverse table x^-1 == x^254 */
307         for (i = 0; i < 256; i++)
308                 raid6_gfinv[i] = gfpow(i, 254);
309
310         /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
311         for (i = 0; i < 256; i ++)
312                 raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1];
313
314         /* Compute log and inverse log */
315         /* Modified code from:
316          *    http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
317          */
318         b = 1;
319         raid6_gflog[0] = 0;
320         raid6_gfilog[255] = 0;
321
322         for (log = 0; log < 255; log++) {
323                 raid6_gflog[b] = (uint8_t) log;
324                 raid6_gfilog[log] = (uint8_t) b;
325                 b = b << 1;
326                 if (b & 256) b = b ^ 0435;
327         }
328
329         tables_ready = 1;
330 }
331
332 uint8_t *zero;
333 int zero_size;
334
335 void ensure_zero_has_size(int chunk_size)
336 {
337         if (zero == NULL || chunk_size > zero_size) {
338                 if (zero)
339                         free(zero);
340                 zero = xcalloc(1, chunk_size);
341                 zero_size = chunk_size;
342         }
343 }
344
345 /* Following was taken from linux/drivers/md/raid6recov.c */
346
347 /* Recover two failed data blocks. */
348
349 void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
350                        uint8_t **ptrs, int neg_offset)
351 {
352         uint8_t *p, *q, *dp, *dq;
353         uint8_t px, qx, db;
354         const uint8_t *pbmul;   /* P multiplier table for B data */
355         const uint8_t *qmul;            /* Q multiplier table (for both) */
356
357         if (faila > failb) {
358                 int t = faila;
359                 faila = failb;
360                 failb = t;
361         }
362
363         if (neg_offset) {
364                 p = ptrs[-1];
365                 q = ptrs[-2];
366         } else {
367                 p = ptrs[disks-2];
368                 q = ptrs[disks-1];
369         }
370
371         /* Compute syndrome with zero for the missing data pages
372            Use the dead data pages as temporary storage for
373            delta p and delta q */
374         dp = ptrs[faila];
375         ptrs[faila] = zero;
376         dq = ptrs[failb];
377         ptrs[failb] = zero;
378
379         qsyndrome(dp, dq, ptrs, disks-2, bytes);
380
381         /* Restore pointer table */
382         ptrs[faila]   = dp;
383         ptrs[failb]   = dq;
384
385         /* Now, pick the proper data tables */
386         pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
387         qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
388
389         /* Now do it... */
390         while ( bytes-- ) {
391                 px    = *p ^ *dp;
392                 qx    = qmul[*q ^ *dq];
393                 *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
394                 *dp++ = db ^ px; /* Reconstructed A */
395                 p++; q++;
396         }
397 }
398
399 /* Recover failure of one data block plus the P block */
400 void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
401                        int neg_offset)
402 {
403         uint8_t *p, *q, *dq;
404         const uint8_t *qmul;            /* Q multiplier table */
405
406         if (neg_offset) {
407                 p = ptrs[-1];
408                 q = ptrs[-2];
409         } else {
410                 p = ptrs[disks-2];
411                 q = ptrs[disks-1];
412         }
413
414         /* Compute syndrome with zero for the missing data page
415            Use the dead data page as temporary storage for delta q */
416         dq = ptrs[faila];
417         ptrs[faila] = zero;
418
419         qsyndrome(p, dq, ptrs, disks-2, bytes);
420
421         /* Restore pointer table */
422         ptrs[faila]   = dq;
423
424         /* Now, pick the proper data tables */
425         qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
426
427         /* Now do it... */
428         while ( bytes-- ) {
429                 *p++ ^= *dq = qmul[*q ^ *dq];
430                 q++; dq++;
431         }
432 }
433
434 /* Try to find out if a specific disk has a problem */
435 int raid6_check_disks(int data_disks, int start, int chunk_size,
436                       int level, int layout, int diskP, int diskQ,
437                       uint8_t *p, uint8_t *q, char **stripes)
438 {
439         int i;
440         int data_id, diskD;
441         uint8_t Px, Qx;
442         int curr_broken_disk = -1;
443         int prev_broken_disk = -1;
444         int broken_status = 0;
445
446         for(i = 0; i < chunk_size; i++) {
447                 Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i];
448                 Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i];
449
450                 if((Px != 0) && (Qx == 0))
451                         curr_broken_disk = diskP;
452
453                 if((Px == 0) && (Qx != 0))
454                         curr_broken_disk = diskQ;
455
456                 if((Px != 0) && (Qx != 0)) {
457                         data_id = (raid6_gflog[Qx] - raid6_gflog[Px]);
458                         if(data_id < 0) data_id += 255;
459                         diskD = geo_map(data_id, start/chunk_size,
460                                         data_disks + 2, level, layout);
461                         curr_broken_disk = diskD;
462                 }
463
464                 if((Px == 0) && (Qx == 0))
465                         curr_broken_disk = curr_broken_disk;
466
467                 if(curr_broken_disk >= data_disks + 2)
468                         broken_status = 2;
469
470                 switch(broken_status) {
471                 case 0:
472                         if(curr_broken_disk != -1) {
473                                 prev_broken_disk = curr_broken_disk;
474                                 broken_status = 1;
475                         }
476                         break;
477
478                 case 1:
479                         if(curr_broken_disk != prev_broken_disk)
480                                 broken_status = 2;
481                         break;
482
483                 case 2:
484                 default:
485                         curr_broken_disk = prev_broken_disk = -2;
486                         break;
487                 }
488         }
489
490         return curr_broken_disk;
491 }
492
493 /*******************************************************************************
494  * Function:    save_stripes
495  * Description:
496  *      Function reads data (only data without P and Q) from array and writes
497  * it to buf and opcjonaly to backup files
498  * Parameters:
499  *      source          : A list of 'fds' of the active disks.
500  *                        Some may be absent
501  *      offsets         : A list of offsets on disk belonging
502  *                       to the array [bytes]
503  *      raid_disks      : geometry: number of disks in the array
504  *      chunk_size      : geometry: chunk size [bytes]
505  *      level           : geometry: RAID level
506  *      layout          : geometry: layout
507  *      nwrites         : number of backup files
508  *      dest            : A list of 'fds' for mirrored targets
509  *                        (e.g. backup files). They are already seeked to right
510  *                        (write) location. If NULL, data will be wrote
511  *                        to the buf only
512  *      start           : start address of data to read (must be stripe-aligned)
513  *                        [bytes]
514  *      length  -       : length of data to read (must be stripe-aligned)
515  *                        [bytes]
516  *      buf             : buffer for data. It is large enough to hold
517  *                        one stripe. It is stripe aligned
518  * Returns:
519  *       0 : success
520  *      -1 : fail
521  ******************************************************************************/
522 int save_stripes(int *source, unsigned long long *offsets,
523                  int raid_disks, int chunk_size, int level, int layout,
524                  int nwrites, int *dest,
525                  unsigned long long start, unsigned long long length,
526                  char *buf)
527 {
528         int len;
529         int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
530         int disk;
531         int i;
532         unsigned long long length_test;
533
534         if (!tables_ready)
535                 make_tables();
536         ensure_zero_has_size(chunk_size);
537
538         len = data_disks * chunk_size;
539         length_test = length / len;
540         length_test *= len;
541
542         if (length != length_test) {
543                 dprintf("Error: save_stripes(): Data are not alligned. EXIT\n");
544                 dprintf("\tArea for saving stripes (length) = %llu\n", length);
545                 dprintf("\tWork step (len)                  = %i\n", len);
546                 dprintf("\tExpected save area (length_test) = %llu\n",
547                         length_test);
548                 abort();
549         }
550
551         while (length > 0) {
552                 int failed = 0;
553                 int fdisk[3], fblock[3];
554                 for (disk = 0; disk < raid_disks ; disk++) {
555                         unsigned long long offset;
556                         int dnum;
557
558                         offset = (start/chunk_size/data_disks)*chunk_size;
559                         dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1,
560                                        start/chunk_size/data_disks,
561                                        raid_disks, level, layout);
562                         if (dnum < 0) abort();
563                         if (source[dnum] < 0 ||
564                             lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 ||
565                             read(source[dnum], buf+disk * chunk_size, chunk_size)
566                             != chunk_size)
567                                 if (failed <= 2) {
568                                         fdisk[failed] = dnum;
569                                         fblock[failed] = disk;
570                                         failed++;
571                                 }
572                 }
573                 if (failed == 0 || fblock[0] >= data_disks)
574                         /* all data disks are good */
575                         ;
576                 else if (failed == 1 || fblock[1] >= data_disks+1) {
577                         /* one failed data disk and good parity */
578                         char *bufs[data_disks];
579                         for (i=0; i < data_disks; i++)
580                                 if (fblock[0] == i)
581                                         bufs[i] = buf + data_disks*chunk_size;
582                                 else
583                                         bufs[i] = buf + i*chunk_size;
584
585                         xor_blocks(buf + fblock[0]*chunk_size,
586                                    bufs, data_disks, chunk_size);
587                 } else if (failed > 2 || level != 6)
588                         /* too much failure */
589                         return -1;
590                 else {
591                         /* RAID6 computations needed. */
592                         uint8_t *bufs[data_disks+4];
593                         int qdisk;
594                         int syndrome_disks;
595                         disk = geo_map(-1, start/chunk_size/data_disks,
596                                        raid_disks, level, layout);
597                         qdisk = geo_map(-2, start/chunk_size/data_disks,
598                                        raid_disks, level, layout);
599                         if (is_ddf(layout)) {
600                                 /* q over 'raid_disks' blocks, in device order.
601                                  * 'p' and 'q' get to be all zero
602                                  */
603                                 for (i = 0; i < raid_disks; i++)
604                                         bufs[i] = zero;
605                                 for (i = 0; i < data_disks; i++) {
606                                         int dnum = geo_map(i,
607                                                            start/chunk_size/data_disks,
608                                                            raid_disks, level, layout);
609                                         int snum;
610                                         /* i is the logical block number, so is index to 'buf'.
611                                          * dnum is physical disk number
612                                          * and thus the syndrome number.
613                                          */
614                                         snum = dnum;
615                                         bufs[snum] = (uint8_t*)buf + chunk_size * i;
616                                 }
617                                 syndrome_disks = raid_disks;
618                         } else {
619                                 /* for md, q is over 'data_disks' blocks,
620                                  * starting immediately after 'q'
621                                  * Note that for the '_6' variety, the p block
622                                  * makes a hole that we need to be careful of.
623                                  */
624                                 int j;
625                                 int snum = 0;
626                                 for (j = 0; j < raid_disks; j++) {
627                                         int dnum = (qdisk + 1 + j) % raid_disks;
628                                         if (dnum == disk || dnum == qdisk)
629                                                 continue;
630                                         for (i = 0; i < data_disks; i++)
631                                                 if (geo_map(i,
632                                                             start/chunk_size/data_disks,
633                                                             raid_disks, level, layout) == dnum)
634                                                         break;
635                                         /* i is the logical block number, so is index to 'buf'.
636                                          * dnum is physical disk number
637                                          * snum is syndrome disk for which 0 is immediately after Q
638                                          */
639                                         bufs[snum] = (uint8_t*)buf + chunk_size * i;
640
641                                         if (fblock[0] == i)
642                                                 fdisk[0] = snum;
643                                         if (fblock[1] == i)
644                                                 fdisk[1] = snum;
645                                         snum++;
646                                 }
647
648                                 syndrome_disks = data_disks;
649                         }
650
651                         /* Place P and Q blocks at end of bufs */
652                         bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks;
653                         bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1);
654
655                         if (fblock[1] == data_disks)
656                                 /* One data failed, and parity failed */
657                                 raid6_datap_recov(syndrome_disks+2, chunk_size,
658                                                   fdisk[0], bufs, 0);
659                         else {
660                                 /* Two data blocks failed, P,Q OK */
661                                 raid6_2data_recov(syndrome_disks+2, chunk_size,
662                                                   fdisk[0], fdisk[1], bufs, 0);
663                         }
664                 }
665                 if (dest) {
666                         for (i = 0; i < nwrites; i++)
667                                 if (write(dest[i], buf, len) != len)
668                                         return -1;
669                 } else {
670                         /* build next stripe in buffer */
671                         buf += len;
672                 }
673                 length -= len;
674                 start += len;
675         }
676         return 0;
677 }
678
679 /* Restore data:
680  * We are given:
681  *  A list of 'fds' of the active disks. Some may be '-1' for not-available.
682  *  A geometry: raid_disks, chunk_size, level, layout
683  *  An 'fd' to read from.  It is already seeked to the right (Read) location.
684  *  A start and length.
685  * The length must be a multiple of the stripe size.
686  *
687  * We build a full stripe in memory and then write it out.
688  * We assume that there are enough working devices.
689  */
690 int restore_stripes(int *dest, unsigned long long *offsets,
691                     int raid_disks, int chunk_size, int level, int layout,
692                     int source, unsigned long long read_offset,
693                     unsigned long long start, unsigned long long length,
694                     char *src_buf)
695 {
696         char *stripe_buf;
697         char **stripes = xmalloc(raid_disks * sizeof(char*));
698         char **blocks = xmalloc(raid_disks * sizeof(char*));
699         int i;
700         int rv;
701
702         int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
703
704         if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size))
705                 stripe_buf = NULL;
706
707         if (zero == NULL || chunk_size > zero_size) {
708                 if (zero)
709                         free(zero);
710                 zero = xcalloc(1, chunk_size);
711                 zero_size = chunk_size;
712         }
713
714         if (stripe_buf == NULL || stripes == NULL || blocks == NULL
715             || zero == NULL) {
716                 rv = -2;
717                 goto abort;
718         }
719         for (i = 0; i < raid_disks; i++)
720                 stripes[i] = stripe_buf + i * chunk_size;
721         while (length > 0) {
722                 unsigned int len = data_disks * chunk_size;
723                 unsigned long long offset;
724                 int disk, qdisk;
725                 int syndrome_disks;
726                 if (length < len) {
727                         rv = -3;
728                         goto abort;
729                 }
730                 for (i = 0; i < data_disks; i++) {
731                         int disk = geo_map(i, start/chunk_size/data_disks,
732                                            raid_disks, level, layout);
733                         if (src_buf == NULL) {
734                                 /* read from file */
735                                 if (lseek64(source, read_offset, 0) !=
736                                          (off64_t)read_offset) {
737                                         rv = -1;
738                                         goto abort;
739                                 }
740                                 if (read(source,
741                                          stripes[disk],
742                                          chunk_size) != chunk_size) {
743                                         rv = -1;
744                                         goto abort;
745                                 }
746                         } else {
747                                 /* read from input buffer */
748                                 memcpy(stripes[disk],
749                                        src_buf + read_offset,
750                                        chunk_size);
751                         }
752                         read_offset += chunk_size;
753                 }
754                 /* We have the data, now do the parity */
755                 offset = (start/chunk_size/data_disks) * chunk_size;
756                 switch (level) {
757                 case 4:
758                 case 5:
759                         disk = geo_map(-1, start/chunk_size/data_disks,
760                                            raid_disks, level, layout);
761                         for (i = 0; i < data_disks; i++)
762                                 blocks[i] = stripes[(disk+1+i) % raid_disks];
763                         xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
764                         break;
765                 case 6:
766                         disk = geo_map(-1, start/chunk_size/data_disks,
767                                        raid_disks, level, layout);
768                         qdisk = geo_map(-2, start/chunk_size/data_disks,
769                                        raid_disks, level, layout);
770                         if (is_ddf(layout)) {
771                                 /* q over 'raid_disks' blocks, in device order.
772                                  * 'p' and 'q' get to be all zero
773                                  */
774                                 for (i = 0; i < raid_disks; i++)
775                                         if (i == disk || i == qdisk)
776                                                 blocks[i] = (char*)zero;
777                                         else
778                                                 blocks[i] = stripes[i];
779                                 syndrome_disks = raid_disks;
780                         } else {
781                                 /* for md, q is over 'data_disks' blocks,
782                                  * starting immediately after 'q'
783                                  */
784                                 for (i = 0; i < data_disks; i++)
785                                         blocks[i] = stripes[(qdisk+1+i) % raid_disks];
786
787                                 syndrome_disks = data_disks;
788                         }
789                         qsyndrome((uint8_t*)stripes[disk],
790                                   (uint8_t*)stripes[qdisk],
791                                   (uint8_t**)blocks,
792                                   syndrome_disks, chunk_size);
793                         break;
794                 }
795                 for (i=0; i < raid_disks ; i++)
796                         if (dest[i] >= 0) {
797                                 if (lseek64(dest[i],
798                                          offsets[i]+offset, 0) < 0) {
799                                         rv = -1;
800                                         goto abort;
801                                 }
802                                 if (write(dest[i], stripes[i],
803                                          chunk_size) != chunk_size) {
804                                         rv = -1;
805                                         goto abort;
806                                 }
807                         }
808                 length -= len;
809                 start += len;
810         }
811         rv = 0;
812
813 abort:
814         free(stripe_buf);
815         free(stripes);
816         free(blocks);
817         return rv;
818 }
819
820 #ifdef MAIN
821
822 int test_stripes(int *source, unsigned long long *offsets,
823                  int raid_disks, int chunk_size, int level, int layout,
824                  unsigned long long start, unsigned long long length)
825 {
826         /* ready the data and p (and q) blocks, and check we got them right */
827         char *stripe_buf = xmalloc(raid_disks * chunk_size);
828         char **stripes = xmalloc(raid_disks * sizeof(char*));
829         char **blocks = xmalloc(raid_disks * sizeof(char*));
830         uint8_t *p = xmalloc(chunk_size);
831         uint8_t *q = xmalloc(chunk_size);
832
833         int i;
834         int diskP, diskQ;
835         int data_disks = raid_disks - (level == 5 ? 1: 2);
836
837         if (!tables_ready)
838                 make_tables();
839
840         for ( i = 0 ; i < raid_disks ; i++)
841                 stripes[i] = stripe_buf + i * chunk_size;
842
843         while (length > 0) {
844                 int disk;
845
846                 for (i = 0 ; i < raid_disks ; i++) {
847                         lseek64(source[i], offsets[i]+start, 0);
848                         read(source[i], stripes[i], chunk_size);
849                 }
850                 for (i = 0 ; i < data_disks ; i++) {
851                         int disk = geo_map(i, start/chunk_size, raid_disks,
852                                            level, layout);
853                         blocks[i] = stripes[disk];
854                         printf("%d->%d\n", i, disk);
855                 }
856                 switch(level) {
857                 case 6:
858                         qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
859                         diskP = geo_map(-1, start/chunk_size, raid_disks,
860                                        level, layout);
861                         if (memcmp(p, stripes[diskP], chunk_size) != 0) {
862                                 printf("P(%d) wrong at %llu\n", diskP,
863                                        start / chunk_size);
864                         }
865                         diskQ = geo_map(-2, start/chunk_size, raid_disks,
866                                        level, layout);
867                         if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
868                                 printf("Q(%d) wrong at %llu\n", diskQ,
869                                        start / chunk_size);
870                         }
871                         disk = raid6_check_disks(data_disks, start, chunk_size,
872                                                  level, layout, diskP, diskQ,
873                                                  p, q, stripes);
874                         if(disk >= 0) {
875                           printf("Possible failed disk: %d\n", disk);
876                         }
877                         if(disk == -2) {
878                           printf("Failure detected, but disk unknown\n");
879                         }
880                         break;
881                 }
882                 length -= chunk_size;
883                 start += chunk_size;
884         }
885         return 0;
886 }
887
unsigned long long getnum(char *str, char **err)
{
        /* Parse 'str' as an unsigned base-10 number.
         * The whole string must be consumed by the conversion; if it is
         * empty or has trailing characters, '*err' is pointed at 'str'
         * and 0 is returned.  '*err' is left untouched on success, so a
         * caller can parse several values and test it once afterwards.
         */
        char *end = NULL;
        unsigned long long value = strtoull(str, &end, 10);

        if (end != str && *end == '\0')
                return value;

        *err = str;
        return 0;
}
898
char const Name[] = "test_restripe";
int main(int argc, char *argv[])
{
        /* save/restore/test file raid_disks chunk_size level layout start length devices...
         *
         * Each device argument may be written as "path:N", in which case
         * N is multiplied by 512 and used as the byte offset for that
         * device.
         *
         * Exit status: 0 on success, 1 for a usage error or when the
         * selected operation reports failure, 2 for a bad argument,
         * 3 when a file or device cannot be opened.
         */
        int save;
        int *fds;
        char *file;
        char *buf;
        int storefd;
        unsigned long long *offsets;
        int raid_disks, chunk_size, level, layout;
        unsigned long long start, length;
        const char *what;
        int rv;
        int i;

        char *err = NULL;
        if (argc < 10) {
                fprintf(stderr, "Usage: test_stripe save/restore/test file raid_disks chunk_size level layout start length devices...\n");
                exit(1);
        }
        if (strcmp(argv[1], "save") == 0)
                save = 1;
        else if (strcmp(argv[1], "restore") == 0)
                save = 0;
        else if (strcmp(argv[1], "test") == 0)
                save = 2;
        else {
                fprintf(stderr, "test_stripe: must give 'save', 'restore' or 'test'.\n");
                exit(2);
        }

        file = argv[2];
        raid_disks = getnum(argv[3], &err);
        chunk_size = getnum(argv[4], &err);
        level = getnum(argv[5], &err);
        layout = getnum(argv[6], &err);
        start = getnum(argv[7], &err);
        length = getnum(argv[8], &err);
        if (err) {
                fprintf(stderr, "test_stripe: Bad number: %s\n", err);
                exit(2);
        }
        /* chunk_size is used as a divisor by the stripe routines */
        if (chunk_size <= 0) {
                fprintf(stderr, "test_stripe: chunk_size must be positive\n");
                exit(2);
        }
        /* together with argc >= 10 this also guarantees raid_disks >= 1 */
        if (argc != raid_disks + 9) {
                fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
                        raid_disks, argc-9);
                exit(2);
        }
        fds = xmalloc(raid_disks * sizeof(*fds));
        offsets = xcalloc(raid_disks, sizeof(*offsets));

        storefd = open(file, O_RDWR);
        if (storefd < 0) {
                perror(file);
                fprintf(stderr, "test_stripe: could not open %s.\n", file);
                exit(3);
        }
        for (i = 0; i < raid_disks; i++) {
                char *p = strchr(argv[9+i], ':');

                if (p != NULL) {
                        /* split "path:N" in place and parse N strictly,
                         * so a typo is reported instead of silently
                         * becoming a different offset
                         */
                        *p++ = '\0';
                        offsets[i] = getnum(p, &err) * 512;
                        if (err) {
                                fprintf(stderr,
                                        "test_stripe: Bad number: %s\n", err);
                                exit(2);
                        }
                }

                fds[i] = open(argv[9+i], O_RDWR);
                if (fds[i] < 0) {
                        perror(argv[9+i]);
                        fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
                        exit(3);
                }
        }

        /* multiply in size_t so a large geometry cannot overflow 'int' */
        buf = xmalloc((size_t)raid_disks * chunk_size);

        if (save == 1) {
                what = "save_stripes";
                rv = save_stripes(fds, offsets,
                                  raid_disks, chunk_size, level, layout,
                                  1, &storefd,
                                  start, length, buf);
        } else if (save == 2) {
                what = "test_stripes";
                rv = test_stripes(fds, offsets,
                                  raid_disks, chunk_size, level, layout,
                                  start, length);
        } else {
                what = "restore_stripes";
                rv = restore_stripes(fds, offsets,
                                     raid_disks, chunk_size, level, layout,
                                     storefd, 0ULL,
                                     start, length, NULL);
        }
        if (rv != 0) {
                fprintf(stderr, "test_stripe: %s returned %d\n", what, rv);
                exit(1);
        }
        exit(0);
}
1007
1008 #endif /* MAIN */