]> git.neil.brown.name Git - mdadm.git/blob - restripe.c
Release mdadm-4.0
[mdadm.git] / restripe.c
1 /*
2  * mdadm - manage Linux "md" devices aka RAID arrays.
3  *
4  * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
5  *
6  *
7  *    This program is free software; you can redistribute it and/or modify
8  *    it under the terms of the GNU General Public License as published by
9  *    the Free Software Foundation; either version 2 of the License, or
10  *    (at your option) any later version.
11  *
12  *    This program is distributed in the hope that it will be useful,
13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *    GNU General Public License for more details.
16  *
17  *    You should have received a copy of the GNU General Public License
18  *    along with this program; if not, write to the Free Software
19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  *    Author: Neil Brown
22  *    Email: <neilb@suse.de>
23  */
24
25 #include "mdadm.h"
26 #include <stdint.h>
27
/* To restripe, we read from old geometry to a buffer, and
 * write from buffer to new geometry.
 * When reading, we might have missing devices and so could need
 * to reconstruct.
 * When writing, we need to create correct parity and Q.
 *
 */
35
36 int geo_map(int block, unsigned long long stripe, int raid_disks,
37                    int level, int layout)
38 {
39         /* On the given stripe, find which disk in the array will have
40          * block numbered 'block'.
41          * '-1' means the parity block.
42          * '-2' means the Q syndrome.
43          */
44         int pd;
45
46         /* layout is not relevant for raid0 and raid4 */
47         if ((level == 0) ||
48             (level == 4))
49                 layout = 0;
50
51         switch(level*100 + layout) {
52         case 000:
53         case 400:
54         case 500 + ALGORITHM_PARITY_N:
55                 /* raid 4 isn't messed around by parity blocks */
56                 if (block == -1)
57                         return raid_disks-1; /* parity block */
58                 return block;
59         case 500 + ALGORITHM_LEFT_ASYMMETRIC:
60                 pd = (raid_disks-1) - stripe % raid_disks;
61                 if (block == -1)
62                         return pd;
63                 if (block >= pd)
64                         block++;
65                 return block;
66
67         case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
68                 pd = stripe % raid_disks;
69                 if (block == -1)
70                         return pd;
71                 if (block >= pd)
72                         block++;
73                 return block;
74
75         case 500 + ALGORITHM_LEFT_SYMMETRIC:
76                 pd = (raid_disks - 1) - stripe % raid_disks;
77                 if (block == -1)
78                         return pd;
79                 return (pd + 1 + block) % raid_disks;
80
81         case 500 + ALGORITHM_RIGHT_SYMMETRIC:
82                 pd = stripe % raid_disks;
83                 if (block == -1)
84                         return pd;
85                 return (pd + 1 + block) % raid_disks;
86
87         case 500 + ALGORITHM_PARITY_0:
88                 return block + 1;
89
90         case 600 + ALGORITHM_PARITY_N_6:
91                 if (block == -2)
92                         return raid_disks - 1;
93                 if (block == -1)
94                         return raid_disks - 2; /* parity block */
95                 return block;
96         case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
97                 if (block == -2)
98                         return raid_disks - 1;
99                 raid_disks--;
100                 pd = (raid_disks-1) - stripe % raid_disks;
101                 if (block == -1)
102                         return pd;
103                 if (block >= pd)
104                         block++;
105                 return block;
106
107         case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
108                 if (block == -2)
109                         return raid_disks - 1;
110                 raid_disks--;
111                 pd = stripe % raid_disks;
112                 if (block == -1)
113                         return pd;
114                 if (block >= pd)
115                         block++;
116                 return block;
117
118         case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
119                 if (block == -2)
120                         return raid_disks - 1;
121                 raid_disks--;
122                 pd = (raid_disks - 1) - stripe % raid_disks;
123                 if (block == -1)
124                         return pd;
125                 return (pd + 1 + block) % raid_disks;
126
127         case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
128                 if (block == -2)
129                         return raid_disks - 1;
130                 raid_disks--;
131                 pd = stripe % raid_disks;
132                 if (block == -1)
133                         return pd;
134                 return (pd + 1 + block) % raid_disks;
135
136         case 600 + ALGORITHM_PARITY_0_6:
137                 if (block == -2)
138                         return raid_disks - 1;
139                 return block + 1;
140
141         case 600 + ALGORITHM_PARITY_0:
142                 if (block == -1)
143                         return 0;
144                 if (block == -2)
145                         return 1;
146                 return block + 2;
147
148         case 600 + ALGORITHM_LEFT_ASYMMETRIC:
149                 pd = raid_disks - 1 - (stripe % raid_disks);
150                 if (block == -1)
151                         return pd;
152                 if (block == -2)
153                         return (pd+1) % raid_disks;
154                 if (pd == raid_disks - 1)
155                         return block+1;
156                 if (block >= pd)
157                         return block+2;
158                 return block;
159
160         case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
161                 /* Different order for calculating Q, otherwize same as ... */
162         case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
163                 pd = stripe % raid_disks;
164                 if (block == -1)
165                         return pd;
166                 if (block == -2)
167                         return (pd+1) % raid_disks;
168                 if (pd == raid_disks - 1)
169                         return block+1;
170                 if (block >= pd)
171                         return block+2;
172                 return block;
173
174         case 600 + ALGORITHM_LEFT_SYMMETRIC:
175                 pd = raid_disks - 1 - (stripe % raid_disks);
176                 if (block == -1)
177                         return pd;
178                 if (block == -2)
179                         return (pd+1) % raid_disks;
180                 return (pd + 2 + block) % raid_disks;
181
182         case 600 + ALGORITHM_RIGHT_SYMMETRIC:
183                 pd = stripe % raid_disks;
184                 if (block == -1)
185                         return pd;
186                 if (block == -2)
187                         return (pd+1) % raid_disks;
188                 return (pd + 2 + block) % raid_disks;
189
190         case 600 + ALGORITHM_ROTATING_N_RESTART:
191                 /* Same a left_asymmetric, by first stripe is
192                  * D D D P Q  rather than
193                  * Q D D D P
194                  */
195                 pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
196                 if (block == -1)
197                         return pd;
198                 if (block == -2)
199                         return (pd+1) % raid_disks;
200                 if (pd == raid_disks - 1)
201                         return block+1;
202                 if (block >= pd)
203                         return block+2;
204                 return block;
205
206         case 600 + ALGORITHM_ROTATING_N_CONTINUE:
207                 /* Same as left_symmetric but Q is before P */
208                 pd = raid_disks - 1 - (stripe % raid_disks);
209                 if (block == -1)
210                         return pd;
211                 if (block == -2)
212                         return (pd+raid_disks-1) % raid_disks;
213                 return (pd + 1 + block) % raid_disks;
214         }
215         return -1;
216 }
217
218 int is_ddf(int layout)
219 {
220         switch (layout)
221         {
222         default:
223                 return 0;
224         case ALGORITHM_ROTATING_N_CONTINUE:
225         case ALGORITHM_ROTATING_N_RESTART:
226         case ALGORITHM_ROTATING_ZERO_RESTART:
227                 return 1;
228         }
229 }
230
/* Store in 'target' the byte-wise XOR of the 'disks' buffers listed in
 * 'sources'; every buffer is 'size' bytes long.  With no sources the
 * target is filled with zeros.
 * Deliberately simple: one byte at a time, not tuned for speed.
 */
void xor_blocks(char *target, char **sources, int disks, int size)
{
	int pos;

	for (pos = 0; pos < size; pos++) {
		char acc = 0;
		int d = 0;

		while (d < disks) {
			acc ^= sources[d][pos];
			d++;
		}
		target[pos] = acc;
	}
}
242
/* Compute the P (plain XOR) and Q (GF(2^8) weighted) blocks over the
 * 'disks' buffers in 'sources', each 'size' bytes long, storing them in
 * 'p' and 'q'.
 * Q is accumulated from the highest-numbered source downwards: the
 * running value is multiplied by 2 in the field (shift left, XOR 0x1d
 * when the top bit was set) before the next source byte is added.
 */
void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
{
	int pos;

	for (pos = 0; pos < size; pos++) {
		uint8_t par = sources[disks-1][pos];
		uint8_t syn = par;
		int z = disks - 2;

		while (z >= 0) {
			uint8_t data = sources[z][pos];
			uint8_t doubled = (syn << 1) & 0xff;

			if (syn & 0x80)
				doubled ^= 0x1d;
			par ^= data;
			syn = doubled ^ data;
			z--;
		}
		p[pos] = par;
		q[pos] = syn;
	}
}
262
263 /*
264  * The following was taken from linux/drivers/md/mktables.c, and modified
265  * to create in-memory tables rather than C code
266  */
/* Multiply 'a' by 'b' in GF(2^8) by shift-and-add: for each set bit of
 * 'b' the current multiple of 'a' is XORed into the product, and 'a' is
 * doubled each round, XORing in 0x1d whenever its top bit is shifted out.
 */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t product = 0;

	for (; b != 0; b >>= 1) {
		if (b & 1)
			product ^= a;
		if (a & 0x80)
			a = (a << 1) ^ 0x1d;
		else
			a <<= 1;
	}

	return product;
}
280
/* Raise 'a' to the power 'b' using gfmul(), by square-and-multiply.
 * The exponent is first reduced modulo 255 into the range 0..254, so
 * negative exponents are accepted.
 */
static uint8_t gfpow(uint8_t a, int b)
{
	uint8_t result = 1;
	int exponent = b % 255;

	if (exponent < 0)
		exponent += 255;

	for (; exponent != 0; exponent >>= 1) {
		if (exponent & 1)
			result = gfmul(result, a);
		a = gfmul(a, a);
	}

	return result;
}
298
/* Lookup tables filled in by make_tables(); tables_ready becomes 1 once
 * they have been built.
 */
int tables_ready = 0;
uint8_t raid6_gfmul[256][256];	/* gfmul(a, b) for every byte pair */
uint8_t raid6_gfexp[256];	/* 2^x */
uint8_t raid6_gfinv[256];	/* x^-1, computed as x^254 */
uint8_t raid6_gfexi[256];	/* inverse of (2^x + 1) */
uint8_t raid6_gflog[256];	/* log base 2 */
uint8_t raid6_gfilog[256];	/* inverse log */

/* Build every table above and then mark them as ready. */
void make_tables(void)
{
	int a, b;
	uint8_t power;
	uint32_t elem, exponent;

	/* Multiplication table */
	for (a = 0; a < 256; a++) {
		for (b = 0; b < 256; b++)
			raid6_gfmul[a][b] = gfmul(a, b);
	}

	/* Power-of-2 (exponent) table */
	power = 1;
	for (a = 0; a < 256; a++) {
		raid6_gfexp[a] = power;
		power = gfmul(power, 2);
		if (power == 1)
			power = 0;	/* For entry 255, not a real entry */
	}

	/* Inverse table: x^-1 == x^254 */
	for (a = 0; a < 256; a++)
		raid6_gfinv[a] = gfpow(a, 254);

	/* inv(2^x + 1) (exponent-xor-inverse) table */
	for (a = 0; a < 256; a++)
		raid6_gfexi[a] = raid6_gfinv[raid6_gfexp[a] ^ 1];

	/* Log and inverse log, modified code from:
	 *    http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
	 */
	raid6_gflog[0] = 0;
	raid6_gfilog[255] = 0;
	elem = 1;
	for (exponent = 0; exponent < 255; exponent++) {
		raid6_gflog[elem] = (uint8_t) exponent;
		raid6_gfilog[exponent] = (uint8_t) elem;
		elem <<= 1;
		if (elem & 256)
			elem ^= 0435;
	}

	tables_ready = 1;
}
351
/* Shared all-zero buffer and the number of bytes it currently holds. */
uint8_t *zero;
int zero_size;

/* Make sure 'zero' points at a zero-filled buffer of at least
 * 'chunk_size' bytes.  An existing buffer that is already big enough is
 * kept; otherwise it is released and a new one allocated.
 */
void ensure_zero_has_size(int chunk_size)
{
	if (zero != NULL && chunk_size <= zero_size)
		return;

	free(zero);
	zero = xcalloc(1, chunk_size);
	zero_size = chunk_size;
}
364
365 /* Following was taken from linux/drivers/md/raid6recov.c */
366
367 /* Recover two failed data blocks. */
368
369 void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
370                        uint8_t **ptrs, int neg_offset)
371 {
372         uint8_t *p, *q, *dp, *dq;
373         uint8_t px, qx, db;
374         const uint8_t *pbmul;   /* P multiplier table for B data */
375         const uint8_t *qmul;            /* Q multiplier table (for both) */
376
377         if (faila > failb) {
378                 int t = faila;
379                 faila = failb;
380                 failb = t;
381         }
382
383         if (neg_offset) {
384                 p = ptrs[-1];
385                 q = ptrs[-2];
386         } else {
387                 p = ptrs[disks-2];
388                 q = ptrs[disks-1];
389         }
390
391         /* Compute syndrome with zero for the missing data pages
392            Use the dead data pages as temporary storage for
393            delta p and delta q */
394         dp = ptrs[faila];
395         ptrs[faila] = zero;
396         dq = ptrs[failb];
397         ptrs[failb] = zero;
398
399         qsyndrome(dp, dq, ptrs, disks-2, bytes);
400
401         /* Restore pointer table */
402         ptrs[faila]   = dp;
403         ptrs[failb]   = dq;
404
405         /* Now, pick the proper data tables */
406         pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
407         qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
408
409         /* Now do it... */
410         while ( bytes-- ) {
411                 px    = *p ^ *dp;
412                 qx    = qmul[*q ^ *dq];
413                 *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
414                 *dp++ = db ^ px; /* Reconstructed A */
415                 p++; q++;
416         }
417 }
418
419 /* Recover failure of one data block plus the P block */
420 void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
421                        int neg_offset)
422 {
423         uint8_t *p, *q, *dq;
424         const uint8_t *qmul;            /* Q multiplier table */
425
426         if (neg_offset) {
427                 p = ptrs[-1];
428                 q = ptrs[-2];
429         } else {
430                 p = ptrs[disks-2];
431                 q = ptrs[disks-1];
432         }
433
434         /* Compute syndrome with zero for the missing data page
435            Use the dead data page as temporary storage for delta q */
436         dq = ptrs[faila];
437         ptrs[faila] = zero;
438
439         qsyndrome(p, dq, ptrs, disks-2, bytes);
440
441         /* Restore pointer table */
442         ptrs[faila]   = dq;
443
444         /* Now, pick the proper data tables */
445         qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
446
447         /* Now do it... */
448         while ( bytes-- ) {
449                 *p++ ^= *dq = qmul[*q ^ *dq];
450                 q++; dq++;
451         }
452 }
453
454 /* Try to find out if a specific disk has a problem */
455 int raid6_check_disks(int data_disks, int start, int chunk_size,
456                       int level, int layout, int diskP, int diskQ,
457                       uint8_t *p, uint8_t *q, char **stripes)
458 {
459         int i;
460         int data_id, diskD;
461         uint8_t Px, Qx;
462         int curr_broken_disk = -1;
463         int prev_broken_disk = -1;
464         int broken_status = 0;
465
466         for(i = 0; i < chunk_size; i++) {
467                 Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i];
468                 Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i];
469
470                 if((Px != 0) && (Qx == 0))
471                         curr_broken_disk = diskP;
472
473                 if((Px == 0) && (Qx != 0))
474                         curr_broken_disk = diskQ;
475
476                 if((Px != 0) && (Qx != 0)) {
477                         data_id = (raid6_gflog[Qx] - raid6_gflog[Px]);
478                         if(data_id < 0) data_id += 255;
479                         diskD = geo_map(data_id, start/chunk_size,
480                                         data_disks + 2, level, layout);
481                         curr_broken_disk = diskD;
482                 }
483
484                 if((Px == 0) && (Qx == 0))
485                         curr_broken_disk = prev_broken_disk;
486
487                 if(curr_broken_disk >= data_disks + 2)
488                         broken_status = 2;
489
490                 switch(broken_status) {
491                 case 0:
492                         if(curr_broken_disk != -1) {
493                                 prev_broken_disk = curr_broken_disk;
494                                 broken_status = 1;
495                         }
496                         break;
497
498                 case 1:
499                         if(curr_broken_disk != prev_broken_disk)
500                                 broken_status = 2;
501                         break;
502
503                 case 2:
504                 default:
505                         curr_broken_disk = prev_broken_disk = -2;
506                         break;
507                 }
508         }
509
510         return curr_broken_disk;
511 }
512
513 /*******************************************************************************
514  * Function:    save_stripes
515  * Description:
516  *      Function reads data (only data without P and Q) from array and writes
517  * it to buf and opcjonaly to backup files
518  * Parameters:
519  *      source          : A list of 'fds' of the active disks.
520  *                        Some may be absent
521  *      offsets         : A list of offsets on disk belonging
522  *                       to the array [bytes]
523  *      raid_disks      : geometry: number of disks in the array
524  *      chunk_size      : geometry: chunk size [bytes]
525  *      level           : geometry: RAID level
526  *      layout          : geometry: layout
527  *      nwrites         : number of backup files
528  *      dest            : A list of 'fds' for mirrored targets
529  *                        (e.g. backup files). They are already seeked to right
530  *                        (write) location. If NULL, data will be wrote
531  *                        to the buf only
532  *      start           : start address of data to read (must be stripe-aligned)
533  *                        [bytes]
534  *      length  -       : length of data to read (must be stripe-aligned)
535  *                        [bytes]
536  *      buf             : buffer for data. It is large enough to hold
537  *                        one stripe. It is stripe aligned
538  * Returns:
539  *       0 : success
540  *      -1 : fail
541  ******************************************************************************/
542 int save_stripes(int *source, unsigned long long *offsets,
543                  int raid_disks, int chunk_size, int level, int layout,
544                  int nwrites, int *dest,
545                  unsigned long long start, unsigned long long length,
546                  char *buf)
547 {
548         int len;
549         int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
550         int disk;
551         int i;
552         unsigned long long length_test;
553
554         if (!tables_ready)
555                 make_tables();
556         ensure_zero_has_size(chunk_size);
557
558         len = data_disks * chunk_size;
559         length_test = length / len;
560         length_test *= len;
561
562         if (length != length_test) {
563                 dprintf("Error: save_stripes(): Data are not alligned. EXIT\n");
564                 dprintf("\tArea for saving stripes (length) = %llu\n", length);
565                 dprintf("\tWork step (len)                  = %i\n", len);
566                 dprintf("\tExpected save area (length_test) = %llu\n",
567                         length_test);
568                 abort();
569         }
570
571         while (length > 0) {
572                 int failed = 0;
573                 int fdisk[3], fblock[3];
574                 for (disk = 0; disk < raid_disks ; disk++) {
575                         unsigned long long offset;
576                         int dnum;
577
578                         offset = (start/chunk_size/data_disks)*chunk_size;
579                         dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1,
580                                        start/chunk_size/data_disks,
581                                        raid_disks, level, layout);
582                         if (dnum < 0) abort();
583                         if (source[dnum] < 0 ||
584                             lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 ||
585                             read(source[dnum], buf+disk * chunk_size, chunk_size)
586                             != chunk_size)
587                                 if (failed <= 2) {
588                                         fdisk[failed] = dnum;
589                                         fblock[failed] = disk;
590                                         failed++;
591                                 }
592                 }
593                 if (failed == 0 || fblock[0] >= data_disks)
594                         /* all data disks are good */
595                         ;
596                 else if (failed == 1 || fblock[1] >= data_disks+1) {
597                         /* one failed data disk and good parity */
598                         char *bufs[data_disks];
599                         for (i=0; i < data_disks; i++)
600                                 if (fblock[0] == i)
601                                         bufs[i] = buf + data_disks*chunk_size;
602                                 else
603                                         bufs[i] = buf + i*chunk_size;
604
605                         xor_blocks(buf + fblock[0]*chunk_size,
606                                    bufs, data_disks, chunk_size);
607                 } else if (failed > 2 || level != 6)
608                         /* too much failure */
609                         return -1;
610                 else {
611                         /* RAID6 computations needed. */
612                         uint8_t *bufs[data_disks+4];
613                         int qdisk;
614                         int syndrome_disks;
615                         disk = geo_map(-1, start/chunk_size/data_disks,
616                                        raid_disks, level, layout);
617                         qdisk = geo_map(-2, start/chunk_size/data_disks,
618                                        raid_disks, level, layout);
619                         if (is_ddf(layout)) {
620                                 /* q over 'raid_disks' blocks, in device order.
621                                  * 'p' and 'q' get to be all zero
622                                  */
623                                 for (i = 0; i < raid_disks; i++)
624                                         bufs[i] = zero;
625                                 for (i = 0; i < data_disks; i++) {
626                                         int dnum = geo_map(i,
627                                                            start/chunk_size/data_disks,
628                                                            raid_disks, level, layout);
629                                         int snum;
630                                         /* i is the logical block number, so is index to 'buf'.
631                                          * dnum is physical disk number
632                                          * and thus the syndrome number.
633                                          */
634                                         snum = dnum;
635                                         bufs[snum] = (uint8_t*)buf + chunk_size * i;
636                                 }
637                                 syndrome_disks = raid_disks;
638                         } else {
639                                 /* for md, q is over 'data_disks' blocks,
640                                  * starting immediately after 'q'
641                                  * Note that for the '_6' variety, the p block
642                                  * makes a hole that we need to be careful of.
643                                  */
644                                 int j;
645                                 int snum = 0;
646                                 for (j = 0; j < raid_disks; j++) {
647                                         int dnum = (qdisk + 1 + j) % raid_disks;
648                                         if (dnum == disk || dnum == qdisk)
649                                                 continue;
650                                         for (i = 0; i < data_disks; i++)
651                                                 if (geo_map(i,
652                                                             start/chunk_size/data_disks,
653                                                             raid_disks, level, layout) == dnum)
654                                                         break;
655                                         /* i is the logical block number, so is index to 'buf'.
656                                          * dnum is physical disk number
657                                          * snum is syndrome disk for which 0 is immediately after Q
658                                          */
659                                         bufs[snum] = (uint8_t*)buf + chunk_size * i;
660
661                                         if (fblock[0] == i)
662                                                 fdisk[0] = snum;
663                                         if (fblock[1] == i)
664                                                 fdisk[1] = snum;
665                                         snum++;
666                                 }
667
668                                 syndrome_disks = data_disks;
669                         }
670
671                         /* Place P and Q blocks at end of bufs */
672                         bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks;
673                         bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1);
674
675                         if (fblock[1] == data_disks)
676                                 /* One data failed, and parity failed */
677                                 raid6_datap_recov(syndrome_disks+2, chunk_size,
678                                                   fdisk[0], bufs, 0);
679                         else {
680                                 /* Two data blocks failed, P,Q OK */
681                                 raid6_2data_recov(syndrome_disks+2, chunk_size,
682                                                   fdisk[0], fdisk[1], bufs, 0);
683                         }
684                 }
685                 if (dest) {
686                         for (i = 0; i < nwrites; i++)
687                                 if (write(dest[i], buf, len) != len)
688                                         return -1;
689                 } else {
690                         /* build next stripe in buffer */
691                         buf += len;
692                 }
693                 length -= len;
694                 start += len;
695         }
696         return 0;
697 }
698
699 /* Restore data:
700  * We are given:
701  *  A list of 'fds' of the active disks. Some may be '-1' for not-available.
702  *  A geometry: raid_disks, chunk_size, level, layout
703  *  An 'fd' to read from.  It is already seeked to the right (Read) location.
704  *  A start and length.
705  * The length must be a multiple of the stripe size.
706  *
707  * We build a full stripe in memory and then write it out.
708  * We assume that there are enough working devices.
709  */
710 int restore_stripes(int *dest, unsigned long long *offsets,
711                     int raid_disks, int chunk_size, int level, int layout,
712                     int source, unsigned long long read_offset,
713                     unsigned long long start, unsigned long long length,
714                     char *src_buf)
715 {
716         char *stripe_buf;
717         char **stripes = xmalloc(raid_disks * sizeof(char*));
718         char **blocks = xmalloc(raid_disks * sizeof(char*));
719         int i;
720         int rv;
721
722         int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
723
724         if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size))
725                 stripe_buf = NULL;
726
727         if (zero == NULL || chunk_size > zero_size) {
728                 if (zero)
729                         free(zero);
730                 zero = xcalloc(1, chunk_size);
731                 zero_size = chunk_size;
732         }
733
734         if (stripe_buf == NULL || stripes == NULL || blocks == NULL
735             || zero == NULL) {
736                 rv = -2;
737                 goto abort;
738         }
739         for (i = 0; i < raid_disks; i++)
740                 stripes[i] = stripe_buf + i * chunk_size;
741         while (length > 0) {
742                 unsigned int len = data_disks * chunk_size;
743                 unsigned long long offset;
744                 int disk, qdisk;
745                 int syndrome_disks;
746                 if (length < len) {
747                         rv = -3;
748                         goto abort;
749                 }
750                 for (i = 0; i < data_disks; i++) {
751                         int disk = geo_map(i, start/chunk_size/data_disks,
752                                            raid_disks, level, layout);
753                         if (src_buf == NULL) {
754                                 /* read from file */
755                                 if (lseek64(source, read_offset, 0) !=
756                                          (off64_t)read_offset) {
757                                         rv = -1;
758                                         goto abort;
759                                 }
760                                 if (read(source,
761                                          stripes[disk],
762                                          chunk_size) != chunk_size) {
763                                         rv = -1;
764                                         goto abort;
765                                 }
766                         } else {
767                                 /* read from input buffer */
768                                 memcpy(stripes[disk],
769                                        src_buf + read_offset,
770                                        chunk_size);
771                         }
772                         read_offset += chunk_size;
773                 }
774                 /* We have the data, now do the parity */
775                 offset = (start/chunk_size/data_disks) * chunk_size;
776                 switch (level) {
777                 case 4:
778                 case 5:
779                         disk = geo_map(-1, start/chunk_size/data_disks,
780                                            raid_disks, level, layout);
781                         for (i = 0; i < data_disks; i++)
782                                 blocks[i] = stripes[(disk+1+i) % raid_disks];
783                         xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
784                         break;
785                 case 6:
786                         disk = geo_map(-1, start/chunk_size/data_disks,
787                                        raid_disks, level, layout);
788                         qdisk = geo_map(-2, start/chunk_size/data_disks,
789                                        raid_disks, level, layout);
790                         if (is_ddf(layout)) {
791                                 /* q over 'raid_disks' blocks, in device order.
792                                  * 'p' and 'q' get to be all zero
793                                  */
794                                 for (i = 0; i < raid_disks; i++)
795                                         if (i == disk || i == qdisk)
796                                                 blocks[i] = (char*)zero;
797                                         else
798                                                 blocks[i] = stripes[i];
799                                 syndrome_disks = raid_disks;
800                         } else {
801                                 /* for md, q is over 'data_disks' blocks,
802                                  * starting immediately after 'q'
803                                  */
804                                 for (i = 0; i < data_disks; i++)
805                                         blocks[i] = stripes[(qdisk+1+i) % raid_disks];
806
807                                 syndrome_disks = data_disks;
808                         }
809                         qsyndrome((uint8_t*)stripes[disk],
810                                   (uint8_t*)stripes[qdisk],
811                                   (uint8_t**)blocks,
812                                   syndrome_disks, chunk_size);
813                         break;
814                 }
815                 for (i=0; i < raid_disks ; i++)
816                         if (dest[i] >= 0) {
817                                 if (lseek64(dest[i],
818                                          offsets[i]+offset, 0) < 0) {
819                                         rv = -1;
820                                         goto abort;
821                                 }
822                                 if (write(dest[i], stripes[i],
823                                          chunk_size) != chunk_size) {
824                                         rv = -1;
825                                         goto abort;
826                                 }
827                         }
828                 length -= len;
829                 start += len;
830         }
831         rv = 0;
832
833 abort:
834         free(stripe_buf);
835         free(stripes);
836         free(blocks);
837         return rv;
838 }
839
840 #ifdef MAIN
841
842 int test_stripes(int *source, unsigned long long *offsets,
843                  int raid_disks, int chunk_size, int level, int layout,
844                  unsigned long long start, unsigned long long length)
845 {
846         /* ready the data and p (and q) blocks, and check we got them right */
847         char *stripe_buf = xmalloc(raid_disks * chunk_size);
848         char **stripes = xmalloc(raid_disks * sizeof(char*));
849         char **blocks = xmalloc(raid_disks * sizeof(char*));
850         uint8_t *p = xmalloc(chunk_size);
851         uint8_t *q = xmalloc(chunk_size);
852
853         int i;
854         int diskP, diskQ;
855         int data_disks = raid_disks - (level == 5 ? 1: 2);
856
857         if (!tables_ready)
858                 make_tables();
859
860         for ( i = 0 ; i < raid_disks ; i++)
861                 stripes[i] = stripe_buf + i * chunk_size;
862
863         while (length > 0) {
864                 int disk;
865
866                 for (i = 0 ; i < raid_disks ; i++) {
867                         lseek64(source[i], offsets[i]+start, 0);
868                         read(source[i], stripes[i], chunk_size);
869                 }
870                 for (i = 0 ; i < data_disks ; i++) {
871                         int disk = geo_map(i, start/chunk_size, raid_disks,
872                                            level, layout);
873                         blocks[i] = stripes[disk];
874                         printf("%d->%d\n", i, disk);
875                 }
876                 switch(level) {
877                 case 6:
878                         qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
879                         diskP = geo_map(-1, start/chunk_size, raid_disks,
880                                        level, layout);
881                         if (memcmp(p, stripes[diskP], chunk_size) != 0) {
882                                 printf("P(%d) wrong at %llu\n", diskP,
883                                        start / chunk_size);
884                         }
885                         diskQ = geo_map(-2, start/chunk_size, raid_disks,
886                                        level, layout);
887                         if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
888                                 printf("Q(%d) wrong at %llu\n", diskQ,
889                                        start / chunk_size);
890                         }
891                         disk = raid6_check_disks(data_disks, start, chunk_size,
892                                                  level, layout, diskP, diskQ,
893                                                  p, q, stripes);
894                         if(disk >= 0) {
895                           printf("Possible failed disk: %d\n", disk);
896                         }
897                         if(disk == -2) {
898                           printf("Failure detected, but disk unknown\n");
899                         }
900                         break;
901                 }
902                 length -= chunk_size;
903                 start += chunk_size;
904         }
905         return 0;
906 }
907
/*
 * Parse 'str' as an unsigned decimal number.
 *
 * On success the value is returned and *err is left untouched, so a
 * caller can parse several arguments in a row and test 'err' once at
 * the end.  On failure 0 is returned and *err is set to 'str'.
 *
 * Rejected inputs: an empty string, trailing non-digit characters,
 * a minus sign, and values too large for unsigned long long.
 */
unsigned long long getnum(char *str, char **err)
{
        char *end;
        unsigned long long rv;

        /* strtoull() accepts "-N" and returns the negated (i.e. huge)
         * value, which is never what the caller meant.  In a string
         * that would otherwise parse, the sign is the only place a
         * '-' can appear.
         */
        if (strchr(str, '-') != NULL) {
                *err = str;
                return 0;
        }

        rv = strtoull(str, &end, 10);
        /* strtoull() saturates at the maximum value on overflow; treat
         * that value as out of range.
         */
        if (end == str || *end != '\0' || rv == ~0ULL) {
                *err = str;
                return 0;
        }
        return rv;
}
918
char const Name[] = "test_restripe";

/*
 * Stand-alone driver for the stripe routines.
 *
 * Arguments:
 *   save/restore/test file raid_disks chunk_size level layout start length devices...
 *
 * Each device may be written as "path:offset"; the number after the
 * colon is multiplied by 512 to give that device's byte offset
 * (presumably it is a count of 512-byte sectors).
 *
 * Exit status: 0 on success, 1 for a usage error or when the selected
 * operation reports failure, 2 for a bad argument, 3 when a file or
 * device cannot be opened.
 */
int main(int argc, char *argv[])
{
        int save;
        int *fds;
        char *file;
        int storefd;
        unsigned long long *offsets;
        int raid_disks, chunk_size, level, layout;
        unsigned long long start, length;
        int i;
        int rv;

        char *err = NULL;
        if (argc < 10) {
                fprintf(stderr, "Usage: test_stripe save/restore/test file raid_disks chunk_size level layout start length devices...\n");
                exit(1);
        }
        if (strcmp(argv[1], "save")==0)
                save = 1;
        else if (strcmp(argv[1], "restore") == 0)
                save = 0;
        else if (strcmp(argv[1], "test") == 0)
                save = 2;
        else {
                fprintf(stderr, "test_stripe: must give 'save', 'restore' or 'test'.\n");
                exit(2);
        }

        file = argv[2];
        raid_disks = getnum(argv[3], &err);
        chunk_size = getnum(argv[4], &err);
        level = getnum(argv[5], &err);
        layout = getnum(argv[6], &err);
        start = getnum(argv[7], &err);
        length = getnum(argv[8], &err);
        if (err) {
                fprintf(stderr, "test_stripe: Bad number: %s\n", err);
                exit(2);
        }
        /* Both values are used as divisors and allocation sizes by the
         * stripe routines, so zero (or a value that became negative
         * when narrowed to int) cannot be allowed through.
         */
        if (raid_disks <= 0 || chunk_size <= 0) {
                fprintf(stderr, "test_stripe: raid_disks and chunk_size must be positive\n");
                exit(2);
        }
        if (argc != raid_disks + 9) {
                fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
                        raid_disks, argc-9);
                exit(2);
        }
        fds = xmalloc(raid_disks * sizeof(*fds));
        offsets = xcalloc(raid_disks, sizeof(*offsets));

        storefd = open(file, O_RDWR);
        if (storefd < 0) {
                perror(file);
                fprintf(stderr, "test_stripe: could not open %s.\n", file);
                exit(3);
        }
        for (i = 0; i < raid_disks; i++) {
                char *name = argv[9+i];
                char *colon = strchr(name, ':');

                if (colon != NULL) {
                        /* Split "path:offset" in place and parse the
                         * offset with the same checks as every other
                         * numeric argument.
                         */
                        *colon++ = '\0';
                        offsets[i] = getnum(colon, &err) * 512;
                        if (err) {
                                fprintf(stderr, "test_stripe: Bad number: %s\n",
                                        err);
                                exit(2);
                        }
                }

                fds[i] = open(name, O_RDWR);
                if (fds[i] < 0) {
                        perror(name);
                        fprintf(stderr,"test_stripe: cannot open %s.\n", name);
                        exit(3);
                }
        }

        if (save == 1) {
                /* Only save_stripes() takes the scratch buffer. */
                char *buf = xmalloc(raid_disks * chunk_size);

                rv = save_stripes(fds, offsets,
                                  raid_disks, chunk_size, level, layout,
                                  1, &storefd,
                                  start, length, buf);
                if (rv != 0) {
                        fprintf(stderr,
                                "test_stripe: save_stripes returned %d\n", rv);
                        exit(1);
                }
        } else if (save == 2) {
                rv = test_stripes(fds, offsets,
                                  raid_disks, chunk_size, level, layout,
                                  start, length);
                if (rv != 0) {
                        fprintf(stderr,
                                "test_stripe: test_stripes returned %d\n", rv);
                        exit(1);
                }
        } else {
                rv = restore_stripes(fds, offsets,
                                     raid_disks, chunk_size, level, layout,
                                     storefd, 0ULL,
                                     start, length, NULL);
                if (rv != 0) {
                        fprintf(stderr,
                                "test_stripe: restore_stripes returned %d\n",
                                rv);
                        exit(1);
                }
        }
        exit(0);
}
1027
1028 #endif /* MAIN */