2 * mdadm - manage Linux "md" devices aka RAID arrays.
4 * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * Email: <neilb@suse.de>
28 /* To restripe, we read from old geometry to a buffer, and
29 * read from buffer to new geometry.
30 * When reading, we might have missing devices and so could need
32 * When writing, we need to create correct parity and Q.
36 int geo_map(int block, unsigned long long stripe, int raid_disks,
37 int level, int layout)
39 /* On the given stripe, find which disk in the array will have
40 * block numbered 'block'.
41 * '-1' means the parity block.
42 * '-2' means the Q syndrome.
46 /* layout is not relevant for raid0 and raid4 */
51 switch(level*100 + layout) {
54 case 500 + ALGORITHM_PARITY_N:
55 /* raid 4 isn't messed around by parity blocks */
57 return raid_disks-1; /* parity block */
59 case 500 + ALGORITHM_LEFT_ASYMMETRIC:
60 pd = (raid_disks-1) - stripe % raid_disks;
67 case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
68 pd = stripe % raid_disks;
75 case 500 + ALGORITHM_LEFT_SYMMETRIC:
76 pd = (raid_disks - 1) - stripe % raid_disks;
79 return (pd + 1 + block) % raid_disks;
81 case 500 + ALGORITHM_RIGHT_SYMMETRIC:
82 pd = stripe % raid_disks;
85 return (pd + 1 + block) % raid_disks;
87 case 500 + ALGORITHM_PARITY_0:
90 case 600 + ALGORITHM_PARITY_N_6:
92 return raid_disks - 1;
94 return raid_disks - 2; /* parity block */
96 case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
98 return raid_disks - 1;
100 pd = (raid_disks-1) - stripe % raid_disks;
107 case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
109 return raid_disks - 1;
111 pd = stripe % raid_disks;
118 case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
120 return raid_disks - 1;
122 pd = (raid_disks - 1) - stripe % raid_disks;
125 return (pd + 1 + block) % raid_disks;
127 case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
129 return raid_disks - 1;
131 pd = stripe % raid_disks;
134 return (pd + 1 + block) % raid_disks;
136 case 600 + ALGORITHM_PARITY_0_6:
138 return raid_disks - 1;
141 case 600 + ALGORITHM_PARITY_0:
148 case 600 + ALGORITHM_LEFT_ASYMMETRIC:
149 pd = raid_disks - 1 - (stripe % raid_disks);
153 return (pd+1) % raid_disks;
154 if (pd == raid_disks - 1)
160 case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
161 /* Different order for calculating Q, otherwize same as ... */
162 case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
163 pd = stripe % raid_disks;
167 return (pd+1) % raid_disks;
168 if (pd == raid_disks - 1)
174 case 600 + ALGORITHM_LEFT_SYMMETRIC:
175 pd = raid_disks - 1 - (stripe % raid_disks);
179 return (pd+1) % raid_disks;
180 return (pd + 2 + block) % raid_disks;
182 case 600 + ALGORITHM_RIGHT_SYMMETRIC:
183 pd = stripe % raid_disks;
187 return (pd+1) % raid_disks;
188 return (pd + 2 + block) % raid_disks;
190 case 600 + ALGORITHM_ROTATING_N_RESTART:
191 /* Same a left_asymmetric, by first stripe is
192 * D D D P Q rather than
195 pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
199 return (pd+1) % raid_disks;
200 if (pd == raid_disks - 1)
206 case 600 + ALGORITHM_ROTATING_N_CONTINUE:
207 /* Same as left_symmetric but Q is before P */
208 pd = raid_disks - 1 - (stripe % raid_disks);
212 return (pd+raid_disks-1) % raid_disks;
213 return (pd + 1 + block) % raid_disks;
218 int is_ddf(int layout)
224 case ALGORITHM_ROTATING_N_CONTINUE:
225 case ALGORITHM_ROTATING_N_RESTART:
226 case ALGORITHM_ROTATING_ZERO_RESTART:
231 void xor_blocks(char *target, char **sources, int disks, int size)
234 /* Amazingly inefficient... */
235 for (i=0; i<size; i++) {
237 for (j=0 ; j<disks; j++)
243 void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
246 uint8_t wq0, wp0, wd0, w10, w20;
247 for ( d = 0; d < size; d++) {
248 wq0 = wp0 = sources[disks-1][d];
249 for ( z = disks-2 ; z >= 0 ; z-- ) {
252 w20 = (wq0&0x80) ? 0xff : 0x00;
253 w10 = (wq0 << 1) & 0xff;
264 * The following was taken from linux/drivers/md/mktables.c, and modified
265 * to create in-memory tables rather than C code
267 static uint8_t gfmul(uint8_t a, uint8_t b)
274 a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
281 static uint8_t gfpow(uint8_t a, int b)
299 int tables_ready = 0;
300 uint8_t raid6_gfmul[256][256];
301 uint8_t raid6_gfexp[256];
302 uint8_t raid6_gfinv[256];
303 uint8_t raid6_gfexi[256];
304 uint8_t raid6_gflog[256];
305 uint8_t raid6_gfilog[256];
306 void make_tables(void)
312 /* Compute multiplication table */
313 for (i = 0; i < 256; i++)
314 for (j = 0; j < 256; j++)
315 raid6_gfmul[i][j] = gfmul(i, j);
317 /* Compute power-of-2 table (exponent) */
319 for (i = 0; i < 256; i++) {
323 v = 0; /* For entry 255, not a real entry */
326 /* Compute inverse table x^-1 == x^254 */
327 for (i = 0; i < 256; i++)
328 raid6_gfinv[i] = gfpow(i, 254);
330 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
331 for (i = 0; i < 256; i ++)
332 raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1];
334 /* Compute log and inverse log */
335 /* Modified code from:
336 * http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
340 raid6_gfilog[255] = 0;
342 for (log = 0; log < 255; log++) {
343 raid6_gflog[b] = (uint8_t) log;
344 raid6_gfilog[log] = (uint8_t) b;
346 if (b & 256) b = b ^ 0435;
355 void ensure_zero_has_size(int chunk_size)
357 if (zero == NULL || chunk_size > zero_size) {
360 zero = xcalloc(1, chunk_size);
361 zero_size = chunk_size;
365 /* Following was taken from linux/drivers/md/raid6recov.c */
367 /* Recover two failed data blocks. */
369 void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
370 uint8_t **ptrs, int neg_offset)
372 uint8_t *p, *q, *dp, *dq;
374 const uint8_t *pbmul; /* P multiplier table for B data */
375 const uint8_t *qmul; /* Q multiplier table (for both) */
391 /* Compute syndrome with zero for the missing data pages
392 Use the dead data pages as temporary storage for
393 delta p and delta q */
399 qsyndrome(dp, dq, ptrs, disks-2, bytes);
401 /* Restore pointer table */
405 /* Now, pick the proper data tables */
406 pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
407 qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
413 *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
414 *dp++ = db ^ px; /* Reconstructed A */
419 /* Recover failure of one data block plus the P block */
420 void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
424 const uint8_t *qmul; /* Q multiplier table */
434 /* Compute syndrome with zero for the missing data page
435 Use the dead data page as temporary storage for delta q */
439 qsyndrome(p, dq, ptrs, disks-2, bytes);
441 /* Restore pointer table */
444 /* Now, pick the proper data tables */
445 qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
449 *p++ ^= *dq = qmul[*q ^ *dq];
454 /* Try to find out if a specific disk has a problem */
455 int raid6_check_disks(int data_disks, int start, int chunk_size,
456 int level, int layout, int diskP, int diskQ,
457 uint8_t *p, uint8_t *q, char **stripes)
462 int curr_broken_disk = -1;
463 int prev_broken_disk = -1;
464 int broken_status = 0;
466 for(i = 0; i < chunk_size; i++) {
467 Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i];
468 Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i];
470 if((Px != 0) && (Qx == 0))
471 curr_broken_disk = diskP;
473 if((Px == 0) && (Qx != 0))
474 curr_broken_disk = diskQ;
476 if((Px != 0) && (Qx != 0)) {
477 data_id = (raid6_gflog[Qx] - raid6_gflog[Px]);
478 if(data_id < 0) data_id += 255;
479 diskD = geo_map(data_id, start/chunk_size,
480 data_disks + 2, level, layout);
481 curr_broken_disk = diskD;
484 if((Px == 0) && (Qx == 0))
485 curr_broken_disk = prev_broken_disk;
487 if(curr_broken_disk >= data_disks + 2)
490 switch(broken_status) {
492 if(curr_broken_disk != -1) {
493 prev_broken_disk = curr_broken_disk;
499 if(curr_broken_disk != prev_broken_disk)
505 curr_broken_disk = prev_broken_disk = -2;
510 return curr_broken_disk;
513 /*******************************************************************************
514 * Function: save_stripes
516 * Function reads data (only data without P and Q) from array and writes
517 * it to buf and opcjonaly to backup files
519 * source : A list of 'fds' of the active disks.
521 * offsets : A list of offsets on disk belonging
522 * to the array [bytes]
523 * raid_disks : geometry: number of disks in the array
524 * chunk_size : geometry: chunk size [bytes]
525 * level : geometry: RAID level
526 * layout : geometry: layout
527 * nwrites : number of backup files
528 * dest : A list of 'fds' for mirrored targets
529 * (e.g. backup files). They are already seeked to right
530 * (write) location. If NULL, data will be wrote
532 * start : start address of data to read (must be stripe-aligned)
534 * length - : length of data to read (must be stripe-aligned)
536 * buf : buffer for data. It is large enough to hold
537 * one stripe. It is stripe aligned
541 ******************************************************************************/
542 int save_stripes(int *source, unsigned long long *offsets,
543 int raid_disks, int chunk_size, int level, int layout,
544 int nwrites, int *dest,
545 unsigned long long start, unsigned long long length,
549 int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
552 unsigned long long length_test;
556 ensure_zero_has_size(chunk_size);
558 len = data_disks * chunk_size;
559 length_test = length / len;
562 if (length != length_test) {
563 dprintf("Error: save_stripes(): Data are not alligned. EXIT\n");
564 dprintf("\tArea for saving stripes (length) = %llu\n", length);
565 dprintf("\tWork step (len) = %i\n", len);
566 dprintf("\tExpected save area (length_test) = %llu\n",
573 int fdisk[3], fblock[3];
574 for (disk = 0; disk < raid_disks ; disk++) {
575 unsigned long long offset;
578 offset = (start/chunk_size/data_disks)*chunk_size;
579 dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1,
580 start/chunk_size/data_disks,
581 raid_disks, level, layout);
582 if (dnum < 0) abort();
583 if (source[dnum] < 0 ||
584 lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 ||
585 read(source[dnum], buf+disk * chunk_size, chunk_size)
588 fdisk[failed] = dnum;
589 fblock[failed] = disk;
593 if (failed == 0 || fblock[0] >= data_disks)
594 /* all data disks are good */
596 else if (failed == 1 || fblock[1] >= data_disks+1) {
597 /* one failed data disk and good parity */
598 char *bufs[data_disks];
599 for (i=0; i < data_disks; i++)
601 bufs[i] = buf + data_disks*chunk_size;
603 bufs[i] = buf + i*chunk_size;
605 xor_blocks(buf + fblock[0]*chunk_size,
606 bufs, data_disks, chunk_size);
607 } else if (failed > 2 || level != 6)
608 /* too much failure */
611 /* RAID6 computations needed. */
612 uint8_t *bufs[data_disks+4];
615 disk = geo_map(-1, start/chunk_size/data_disks,
616 raid_disks, level, layout);
617 qdisk = geo_map(-2, start/chunk_size/data_disks,
618 raid_disks, level, layout);
619 if (is_ddf(layout)) {
620 /* q over 'raid_disks' blocks, in device order.
621 * 'p' and 'q' get to be all zero
623 for (i = 0; i < raid_disks; i++)
625 for (i = 0; i < data_disks; i++) {
626 int dnum = geo_map(i,
627 start/chunk_size/data_disks,
628 raid_disks, level, layout);
630 /* i is the logical block number, so is index to 'buf'.
631 * dnum is physical disk number
632 * and thus the syndrome number.
635 bufs[snum] = (uint8_t*)buf + chunk_size * i;
637 syndrome_disks = raid_disks;
639 /* for md, q is over 'data_disks' blocks,
640 * starting immediately after 'q'
641 * Note that for the '_6' variety, the p block
642 * makes a hole that we need to be careful of.
646 for (j = 0; j < raid_disks; j++) {
647 int dnum = (qdisk + 1 + j) % raid_disks;
648 if (dnum == disk || dnum == qdisk)
650 for (i = 0; i < data_disks; i++)
652 start/chunk_size/data_disks,
653 raid_disks, level, layout) == dnum)
655 /* i is the logical block number, so is index to 'buf'.
656 * dnum is physical disk number
657 * snum is syndrome disk for which 0 is immediately after Q
659 bufs[snum] = (uint8_t*)buf + chunk_size * i;
668 syndrome_disks = data_disks;
671 /* Place P and Q blocks at end of bufs */
672 bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks;
673 bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1);
675 if (fblock[1] == data_disks)
676 /* One data failed, and parity failed */
677 raid6_datap_recov(syndrome_disks+2, chunk_size,
680 /* Two data blocks failed, P,Q OK */
681 raid6_2data_recov(syndrome_disks+2, chunk_size,
682 fdisk[0], fdisk[1], bufs, 0);
686 for (i = 0; i < nwrites; i++)
687 if (write(dest[i], buf, len) != len)
690 /* build next stripe in buffer */
701 * A list of 'fds' of the active disks. Some may be '-1' for not-available.
702 * A geometry: raid_disks, chunk_size, level, layout
703 * An 'fd' to read from. It is already seeked to the right (Read) location.
704 * A start and length.
705 * The length must be a multiple of the stripe size.
707 * We build a full stripe in memory and then write it out.
708 * We assume that there are enough working devices.
710 int restore_stripes(int *dest, unsigned long long *offsets,
711 int raid_disks, int chunk_size, int level, int layout,
712 int source, unsigned long long read_offset,
713 unsigned long long start, unsigned long long length,
717 char **stripes = xmalloc(raid_disks * sizeof(char*));
718 char **blocks = xmalloc(raid_disks * sizeof(char*));
722 int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
724 if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size))
727 if (zero == NULL || chunk_size > zero_size) {
730 zero = xcalloc(1, chunk_size);
731 zero_size = chunk_size;
734 if (stripe_buf == NULL || stripes == NULL || blocks == NULL
739 for (i = 0; i < raid_disks; i++)
740 stripes[i] = stripe_buf + i * chunk_size;
742 unsigned int len = data_disks * chunk_size;
743 unsigned long long offset;
750 for (i = 0; i < data_disks; i++) {
751 int disk = geo_map(i, start/chunk_size/data_disks,
752 raid_disks, level, layout);
753 if (src_buf == NULL) {
755 if (lseek64(source, read_offset, 0) !=
756 (off64_t)read_offset) {
762 chunk_size) != chunk_size) {
767 /* read from input buffer */
768 memcpy(stripes[disk],
769 src_buf + read_offset,
772 read_offset += chunk_size;
774 /* We have the data, now do the parity */
775 offset = (start/chunk_size/data_disks) * chunk_size;
779 disk = geo_map(-1, start/chunk_size/data_disks,
780 raid_disks, level, layout);
781 for (i = 0; i < data_disks; i++)
782 blocks[i] = stripes[(disk+1+i) % raid_disks];
783 xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
786 disk = geo_map(-1, start/chunk_size/data_disks,
787 raid_disks, level, layout);
788 qdisk = geo_map(-2, start/chunk_size/data_disks,
789 raid_disks, level, layout);
790 if (is_ddf(layout)) {
791 /* q over 'raid_disks' blocks, in device order.
792 * 'p' and 'q' get to be all zero
794 for (i = 0; i < raid_disks; i++)
795 if (i == disk || i == qdisk)
796 blocks[i] = (char*)zero;
798 blocks[i] = stripes[i];
799 syndrome_disks = raid_disks;
801 /* for md, q is over 'data_disks' blocks,
802 * starting immediately after 'q'
804 for (i = 0; i < data_disks; i++)
805 blocks[i] = stripes[(qdisk+1+i) % raid_disks];
807 syndrome_disks = data_disks;
809 qsyndrome((uint8_t*)stripes[disk],
810 (uint8_t*)stripes[qdisk],
812 syndrome_disks, chunk_size);
815 for (i=0; i < raid_disks ; i++)
818 offsets[i]+offset, 0) < 0) {
822 if (write(dest[i], stripes[i],
823 chunk_size) != chunk_size) {
842 int test_stripes(int *source, unsigned long long *offsets,
843 int raid_disks, int chunk_size, int level, int layout,
844 unsigned long long start, unsigned long long length)
846 /* ready the data and p (and q) blocks, and check we got them right */
847 char *stripe_buf = xmalloc(raid_disks * chunk_size);
848 char **stripes = xmalloc(raid_disks * sizeof(char*));
849 char **blocks = xmalloc(raid_disks * sizeof(char*));
850 uint8_t *p = xmalloc(chunk_size);
851 uint8_t *q = xmalloc(chunk_size);
855 int data_disks = raid_disks - (level == 5 ? 1: 2);
860 for ( i = 0 ; i < raid_disks ; i++)
861 stripes[i] = stripe_buf + i * chunk_size;
866 for (i = 0 ; i < raid_disks ; i++) {
867 lseek64(source[i], offsets[i]+start, 0);
868 read(source[i], stripes[i], chunk_size);
870 for (i = 0 ; i < data_disks ; i++) {
871 int disk = geo_map(i, start/chunk_size, raid_disks,
873 blocks[i] = stripes[disk];
874 printf("%d->%d\n", i, disk);
878 qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
879 diskP = geo_map(-1, start/chunk_size, raid_disks,
881 if (memcmp(p, stripes[diskP], chunk_size) != 0) {
882 printf("P(%d) wrong at %llu\n", diskP,
885 diskQ = geo_map(-2, start/chunk_size, raid_disks,
887 if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
888 printf("Q(%d) wrong at %llu\n", diskQ,
891 disk = raid6_check_disks(data_disks, start, chunk_size,
892 level, layout, diskP, diskQ,
895 printf("Possible failed disk: %d\n", disk);
898 printf("Failure detected, but disk unknown\n");
902 length -= chunk_size;
908 unsigned long long getnum(char *str, char **err)
911 unsigned long long rv = strtoull(str, &e, 10);
919 char const Name[] = "test_restripe";
920 int main(int argc, char *argv[])
922 /* save/restore file raid_disks chunk_size level layout start length devices...
929 unsigned long long *offsets;
930 int raid_disks, chunk_size, level, layout;
931 unsigned long long start, length;
936 fprintf(stderr, "Usage: test_stripe save/restore file raid_disks chunk_size level layout start length devices...\n");
939 if (strcmp(argv[1], "save")==0)
941 else if (strcmp(argv[1], "restore") == 0)
943 else if (strcmp(argv[1], "test") == 0)
946 fprintf(stderr, "test_stripe: must give 'save' or 'restore'.\n");
951 raid_disks = getnum(argv[3], &err);
952 chunk_size = getnum(argv[4], &err);
953 level = getnum(argv[5], &err);
954 layout = getnum(argv[6], &err);
955 start = getnum(argv[7], &err);
956 length = getnum(argv[8], &err);
958 fprintf(stderr, "test_stripe: Bad number: %s\n", err);
961 if (argc != raid_disks + 9) {
962 fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
966 fds = xmalloc(raid_disks * sizeof(*fds));
967 offsets = xcalloc(raid_disks, sizeof(*offsets));
969 storefd = open(file, O_RDWR);
972 fprintf(stderr, "test_stripe: could not open %s.\n", file);
975 for (i=0; i<raid_disks; i++) {
977 p = strchr(argv[9+i], ':');
981 offsets[i] = atoll(p) * 512;
984 fds[i] = open(argv[9+i], O_RDWR);
987 fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
992 buf = xmalloc(raid_disks * chunk_size);
995 int rv = save_stripes(fds, offsets,
996 raid_disks, chunk_size, level, layout,
1001 "test_stripe: save_stripes returned %d\n", rv);
1004 } else if (save == 2) {
1005 int rv = test_stripes(fds, offsets,
1006 raid_disks, chunk_size, level, layout,
1010 "test_stripe: test_stripes returned %d\n", rv);
1014 int rv = restore_stripes(fds, offsets,
1015 raid_disks, chunk_size, level, layout,
1017 start, length, NULL);
1020 "test_stripe: restore_stripes returned %d\n",