]> git.neil.brown.name Git - mdadm.git/blob - Manage.c
Release mdadm-4.0
[mdadm.git] / Manage.c
1 /*
2  * mdadm - manage Linux "md" devices aka RAID arrays.
3  *
4  * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
5  *
6  *
7  *    This program is free software; you can redistribute it and/or modify
8  *    it under the terms of the GNU General Public License as published by
9  *    the Free Software Foundation; either version 2 of the License, or
10  *    (at your option) any later version.
11  *
12  *    This program is distributed in the hope that it will be useful,
13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *    GNU General Public License for more details.
16  *
17  *    You should have received a copy of the GNU General Public License
18  *    along with this program; if not, write to the Free Software
19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  *    Author: Neil Brown
22  *    Email: <neilb@suse.de>
23  */
24
25 #include "mdadm.h"
26 #include "md_u.h"
27 #include "md_p.h"
28 #include <ctype.h>
29
/* md ioctl request codes built locally with _IO(MD_MAJOR, n).
 * Of these, STOP_MD is issued by Manage_stop() when md_get_version()
 * reports a driver older than 0.90.0.
 * NOTE(review): REGISTER_DEV and START_MD look like the matching
 * pre-0.90 requests - confirm against the rest of the file.
 */
#define REGISTER_DEV            _IO (MD_MAJOR, 1)
#define START_MD                _IO (MD_MAJOR, 2)
#define STOP_MD                 _IO (MD_MAJOR, 3)
33
34 int Manage_ro(char *devname, int fd, int readonly)
35 {
36         /* switch to readonly or rw
37          *
38          * requires >= 0.90.0
39          * first check that array is runing
40          * use RESTART_ARRAY_RW or STOP_ARRAY_RO
41          *
42          */
43         mdu_array_info_t array;
44 #ifndef MDASSEMBLE
45         struct mdinfo *mdi;
46 #endif
47         int rv = 0;
48
49         if (md_get_version(fd) < 9000) {
50                 pr_err("need md driver version 0.90.0 or later\n");
51                 return 1;
52         }
53 #ifndef MDASSEMBLE
54         /* If this is an externally-managed array, we need to modify the
55          * metadata_version so that mdmon doesn't undo our change.
56          */
57         mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
58         if (mdi &&
59             mdi->array.major_version == -1 &&
60             is_subarray(mdi->text_version)) {
61                 char vers[64];
62                 strcpy(vers, "external:");
63                 strcat(vers, mdi->text_version);
64                 if (readonly > 0) {
65                         int rv;
66                         /* We set readonly ourselves. */
67                         vers[9] = '-';
68                         sysfs_set_str(mdi, NULL, "metadata_version", vers);
69
70                         close(fd);
71                         rv = sysfs_set_str(mdi, NULL, "array_state", "readonly");
72
73                         if (rv < 0) {
74                                 pr_err("failed to set readonly for %s: %s\n",
75                                         devname, strerror(errno));
76
77                                 vers[9] = mdi->text_version[0];
78                                 sysfs_set_str(mdi, NULL, "metadata_version", vers);
79                                 rv = 1;
80                                 goto out;
81                         }
82                 } else {
83                         char *cp;
84                         /* We cannot set read/write - must signal mdmon */
85                         vers[9] = '/';
86                         sysfs_set_str(mdi, NULL, "metadata_version", vers);
87
88                         cp = strchr(vers+10, '/');
89                         if (cp)
90                                 *cp = 0;
91                         ping_monitor(vers+10);
92                         if (mdi->array.level <= 0)
93                                 sysfs_set_str(mdi, NULL, "array_state", "active");
94                 }
95                 goto out;
96         }
97 #endif
98         if (ioctl(fd, GET_ARRAY_INFO, &array)) {
99                 pr_err("%s does not appear to be active.\n",
100                         devname);
101                 rv = 1;
102                 goto out;
103         }
104
105         if (readonly > 0) {
106                 if (ioctl(fd, STOP_ARRAY_RO, NULL)) {
107                         pr_err("failed to set readonly for %s: %s\n",
108                                 devname, strerror(errno));
109                         rv = 1;
110                         goto out;
111                 }
112         } else if (readonly < 0) {
113                 if (ioctl(fd, RESTART_ARRAY_RW, NULL)) {
114                         pr_err("failed to set writable for %s: %s\n",
115                                 devname, strerror(errno));
116                         rv = 1;
117                         goto out;
118                 }
119         }
120 out:
121 #ifndef MDASSEMBLE
122         sysfs_free(mdi);
123 #endif
124         return rv;
125 }
126
127 #ifndef MDASSEMBLE
128
static void remove_devices(char *devnm, char *path)
{
        /*
         * Remove names at 'path' - possibly with
         * partition suffixes - which link to the 'standard'
         * name for devnm.  These were probably created
         * by mdadm when the array was assembled.
         *
         * For partition N (1..15) the candidate link is 'path' followed
         * by "pN" when 'path' ends in a digit, or by "N" otherwise, and
         * it is only unlinked if it points exactly at /dev/<devnm>pN.
         * A NULL or empty 'path' is ignored.
         */
        char base[40];
        char *path2;
        char link[1024];
        int n;
        int part;
        int ends_in_digit;
        char *be;
        char *pe;

        /* An empty path has no last character to inspect. */
        if (!path || !path[0])
                return;

        snprintf(base, sizeof(base), "/dev/%s", devnm);
        be = base + strlen(base);

        /* Room for the path plus the longest suffix written below. */
        path2 = xmalloc(strlen(path)+20);
        strcpy(path2, path);
        pe = path2 + strlen(path2);

        /* The suffix is always written at 'pe', so the character before
         * it never changes: decide the suffix style once.  <ctype.h>
         * functions need an unsigned char value.
         */
        ends_in_digit = isdigit((unsigned char)pe[-1]) != 0;

        for (part = 0; part < 16; part++) {
                if (part) {
                        snprintf(be, sizeof(base) - (size_t)(be - base),
                                 "p%d", part);

                        if (ends_in_digit)
                                sprintf(pe, "p%d", part);
                        else
                                sprintf(pe, "%d", part);
                }
                /* readlink() does not NUL-terminate, hence the
                 * length check followed by strncmp().
                 */
                n = readlink(path2, link, sizeof(link));
                if (n > 0 && (int)strlen(base) == n &&
                    strncmp(link, base, n) == 0)
                        unlink(path2);
        }
        free(path2);
}
171
int Manage_run(char *devname, int fd, struct context *c)
{
        /* Start an array that has already been configured.
         * The md driver must be 0.90.0 or newer.
         * Returns 1 on the early failures below, otherwise whatever
         * IncrementalScan() returns.
         */
        char name_copy[32];
        char *found;

        if (md_get_version(fd) < 9000) {
                pr_err("need md driver version 0.90.0 or later\n");
                return 1;
        }

        found = fd2devnm(fd);
        if (found == NULL) {
                pr_err("Cannot find %s in sysfs!!\n", devname);
                return 1;
        }

        /* Hand IncrementalScan() a private copy of the name. */
        strcpy(name_copy, found);
        return IncrementalScan(c, name_copy);
}
191
192 int Manage_stop(char *devname, int fd, int verbose, int will_retry)
193 {
194         /* Stop the array.  Array must already be configured
195          * 'will_retry' means that error messages are not wanted.
196          */
197         int rv = 0;
198         struct map_ent *map = NULL;
199         struct mdinfo *mdi;
200         char devnm[32];
201         char container[32];
202         int err;
203         int count;
204         char buf[32];
205         unsigned long long rd1, rd2;
206
207         if (will_retry && verbose == 0)
208                 verbose = -1;
209
210         if (md_get_version(fd) < 9000) {
211                 if (ioctl(fd, STOP_MD, 0) == 0)
212                         return 0;
213                 pr_err("stopping device %s failed: %s\n",
214                        devname, strerror(errno));
215                 return 1;
216         }
217
218         strcpy(devnm, fd2devnm(fd));
219         /* Get EXCL access first.  If this fails, then attempting
220          * to stop is probably a bad idea.
221          */
222         mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
223         if (mdi && is_subarray(mdi->text_version)) {
224                 char *sl;
225                 strncpy(container, mdi->text_version+1, sizeof(container));
226                 container[sizeof(container)-1] = 0;
227                 sl = strchr(container, '/');
228                 if (sl)
229                         *sl = 0;
230         } else
231                 container[0] = 0;
232         close(fd);
233         count = 5;
234         while (((fd = ((devname[0] == '/')
235                        ?open(devname, O_RDONLY|O_EXCL)
236                        :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0
237                 || strcmp(fd2devnm(fd), devnm) != 0)
238                && container[0]
239                && mdmon_running(container)
240                && count) {
241                 /* Can't open, so something might be wrong.  However it
242                  * is a container, so we might be racing with mdmon, so
243                  * retry for a bit.
244                  */
245                 if (fd >= 0)
246                         close(fd);
247                 flush_mdmon(container);
248                 count--;
249         }
250         if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) {
251                 if (fd >= 0)
252                         close(fd);
253                 if (verbose >= 0)
254                         pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
255                                devname);
256                 return 1;
257         }
258         /* If this is an mdmon managed array, just write 'inactive'
259          * to the array state and let mdmon clear up.
260          */
261         if (mdi &&
262             mdi->array.level > 0 &&
263             is_subarray(mdi->text_version)) {
264                 int err;
265                 /* This is mdmon managed. */
266                 close(fd);
267
268                 /* As we had an O_EXCL open, any use of the device
269                  * which blocks STOP_ARRAY is probably a transient use,
270                  * so it is reasonable to retry for a while - 5 seconds.
271                  */
272                 count = 25;
273                 while (count &&
274                        (err = sysfs_set_str(mdi, NULL,
275                                             "array_state",
276                                             "inactive")) < 0
277                        && errno == EBUSY) {
278                         usleep(200000);
279                         count--;
280                 }
281                 if (err) {
282                         if (verbose >= 0)
283                                 pr_err("failed to stop array %s: %s\n",
284                                        devname, strerror(errno));
285                         rv = 1;
286                         goto out;
287                 }
288
289                 /* Give monitor a chance to act */
290                 ping_monitor(mdi->text_version);
291
292                 fd = open_dev_excl(devnm);
293                 if (fd < 0) {
294                         if (verbose >= 0)
295                                 pr_err("failed to completely stop %s: Device is busy\n",
296                                        devname);
297                         rv = 1;
298                         goto out;
299                 }
300         } else if (mdi &&
301                    mdi->array.major_version == -1 &&
302                    mdi->array.minor_version == -2 &&
303                    !is_subarray(mdi->text_version)) {
304                 struct mdstat_ent *mds, *m;
305                 /* container, possibly mdmon-managed.
306                  * Make sure mdmon isn't opening it, which
307                  * would interfere with the 'stop'
308                  */
309                 ping_monitor(mdi->sys_name);
310
311                 /* now check that there are no existing arrays
312                  * which are members of this array
313                  */
314                 mds = mdstat_read(0, 0);
315                 for (m = mds; m; m = m->next)
316                         if (m->metadata_version &&
317                             strncmp(m->metadata_version, "external:", 9)==0 &&
318                             metadata_container_matches(m->metadata_version+9,
319                                                        devnm)) {
320                                 if (verbose >= 0)
321                                         pr_err("Cannot stop container %s: member %s still active\n",
322                                                devname, m->devnm);
323                                 free_mdstat(mds);
324                                 rv = 1;
325                                 goto out;
326                         }
327         }
328
329         /* If the array is undergoing a reshape which changes the number
330          * of devices, then it would be nice to stop it at a point where
331          * it has completed a full number of stripes in both old and
332          * new layouts as this will allow the reshape to be reverted.
333          * So if 'sync_action' is "reshape" and 'raid_disks' shows two
334          * different numbers, then
335          *  - freeze reshape
336          *  - set sync_max to next multiple of both data_disks and
337          *    chunk sizes (or next but one)
338          *  - unfreeze reshape
339          *  - wait on 'sync_completed' for that point to be reached.
340          */
341         if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) &&
342             sysfs_attribute_available(mdi, NULL, "sync_action") &&
343             sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
344             sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
345             strcmp(buf, "reshape\n") == 0 &&
346             sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
347                 unsigned long long position, curr;
348                 unsigned long long chunk1, chunk2;
349                 unsigned long long rddiv, chunkdiv;
350                 unsigned long long sectors;
351                 unsigned long long sync_max, old_sync_max;
352                 unsigned long long completed;
353                 int backwards = 0;
354                 int delay;
355                 int scfd;
356
357                 delay = 40;
358                 while (rd1 > rd2 && delay > 0 &&
359                        sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
360                         /* must be in the critical section - wait a bit */
361                         delay -= 1;
362                         usleep(100000);
363                 }
364
365                 if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
366                         goto done;
367                 /* Array is frozen */
368
369                 rd1 -= mdi->array.level == 6 ? 2 : 1;
370                 rd2 -= mdi->array.level == 6 ? 2 : 1;
371                 sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
372                 if (strncmp(buf, "back", 4) == 0)
373                         backwards = 1;
374                 if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
375                         /* reshape must have finished now */
376                         sysfs_set_str(mdi, NULL, "sync_action", "idle");
377                         goto done;
378                 }
379                 sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
380                 chunk1 /= 512;
381                 chunk2 /= 512;
382                 rddiv = GCD(rd1, rd2);
383                 chunkdiv = GCD(chunk1, chunk2);
384                 sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
385
386                 if (backwards) {
387                         /* Need to subtract 'reshape_position' from
388                          * array size to get equivalent of sync_max.
389                          * Size calculation based on raid5_size in kernel.
390                          */
391                         unsigned long long size = mdi->component_size;
392                         size &= ~(chunk1-1);
393                         size &= ~(chunk2-1);
394                         /* rd1 must be smaller */
395                         /* Reshape may have progressed further backwards than
396                          * recorded, so target even further back (hence "-1")
397                          */
398                         position = (position / sectors - 1) * sectors;
399                         /* rd1 is always the conversion factor between 'sync'
400                          * position and 'reshape' position.
401                          * We read 1 "new" stripe worth of data from where-ever,
402                          * and when write out that full stripe.
403                          */
404                         sync_max = size - position/rd1;
405                 } else {
406                         /* Reshape will very likely be beyond position, and it may
407                          * be too late to stop at '+1', so aim for '+2'
408                          */
409                         position = (position / sectors + 2) * sectors;
410                         sync_max = position/rd1;
411                 }
412                 if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
413                         old_sync_max = mdi->component_size;
414                 /* Must not advance sync_max as that could confuse
415                  * the reshape monitor */
416                 if (sync_max < old_sync_max)
417                         sysfs_set_num(mdi, NULL, "sync_max", sync_max);
418                 sysfs_set_str(mdi, NULL, "sync_action", "idle");
419
420                 /* That should have set things going again.  Now we
421                  * wait a little while (3 second max) for sync_completed
422                  * to reach the target.
423                  * The reshape process can block for 500msec if
424                  * the sync speed limit is hit, so we need to wait
425                  * a lot longer than that. 1 second is usually
426                  * enough.  3 is safe.
427                  */
428                 delay = 3000;
429                 scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
430                 while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
431                         unsigned long long max_completed;
432                         sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
433                         sysfs_fd_get_str(scfd, buf, sizeof(buf));
434                         if (strncmp(buf, "none", 4) == 0) {
435                                 /* Either reshape has aborted, or hasn't
436                                  * quite started yet.  Wait a bit and
437                                  * check  'sync_action' to see.
438                                  */
439                                 usleep(10000);
440                                 sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
441                                 if (strncmp(buf, "reshape", 7) != 0)
442                                         break;
443                         }
444
445                         if (sysfs_fd_get_two(scfd, &completed,
446                                              &max_completed) == 2 &&
447                             /* 'completed' sometimes reads as max-uulong */
448                             completed < max_completed &&
449                             (completed > sync_max ||
450                              (completed == sync_max && curr != position))) {
451                                 while (completed > sync_max) {
452                                         sync_max += sectors / rd1;
453                                         if (backwards)
454                                                 position -= sectors;
455                                         else
456                                                 position += sectors;
457                                 }
458                                 if (sync_max < old_sync_max)
459                                         sysfs_set_num(mdi, NULL, "sync_max", sync_max);
460                         }
461
462                         if (!backwards && curr >= position)
463                                 break;
464                         if (backwards && curr <= position)
465                                 break;
466                         sysfs_wait(scfd, &delay);
467                 }
468                 if (scfd >= 0)
469                         close(scfd);
470
471         }
472 done:
473
474         /* As we have an O_EXCL open, any use of the device
475          * which blocks STOP_ARRAY is probably a transient use,
476          * so it is reasonable to retry for a while - 5 seconds.
477          */
478         count = 25; err = 0;
479         while (count && fd >= 0
480                && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0
481                && errno == EBUSY) {
482                 usleep(200000);
483                 count --;
484         }
485         if (fd >= 0 && err) {
486                 if (verbose >= 0) {
487                         pr_err("failed to stop array %s: %s\n",
488                                devname, strerror(errno));
489                         if (errno == EBUSY)
490                                 cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
491                 }
492                 rv = 1;
493                 goto out;
494         }
495
496         if (get_linux_version() < 2006028) {
497                 /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
498                  * was stopped, so We'll do it here just to be sure.  Drop any
499                  * partitions as well...
500                  */
501                 if (fd >= 0)
502                         ioctl(fd, BLKRRPART, 0);
503                 if (mdi)
504                         sysfs_uevent(mdi, "change");
505         }
506
507         if (devnm[0] && use_udev()) {
508                 struct map_ent *mp = map_by_devnm(&map, devnm);
509                 remove_devices(devnm, mp ? mp->path : NULL);
510         }
511
512         if (verbose >= 0)
513                 pr_err("stopped %s\n", devname);
514         map_lock(&map);
515         map_remove(&map, devnm);
516         map_unlock(&map);
517 out:
518         sysfs_free(mdi);
519
520         return rv;
521 }
522
523 static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
524 {
525         struct mddev_dev *new;
526         new = xmalloc(sizeof(*new));
527         memset(new, 0, sizeof(*new));
528         new->devname = xstrdup(name);
529         new->disposition = disp;
530         new->next = dv->next;
531         dv->next = new;
532         return new;
533 }
534
535 static void add_faulty(struct mddev_dev *dv, int fd, char disp)
536 {
537         mdu_array_info_t array;
538         mdu_disk_info_t disk;
539         int remaining_disks;
540         int i;
541
542         if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
543                 return;
544
545         remaining_disks = array.nr_disks;
546         for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
547                 char buf[40];
548                 disk.number = i;
549                 if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
550                         continue;
551                 if (disk.major == 0 && disk.minor == 0)
552                         continue;
553                 remaining_disks--;
554                 if ((disk.state & 1) == 0) /* not faulty */
555                         continue;
556                 sprintf(buf, "%d:%d", disk.major, disk.minor);
557                 dv = add_one(dv, buf, disp);
558         }
559 }
560
561 static void add_detached(struct mddev_dev *dv, int fd, char disp)
562 {
563         mdu_array_info_t array;
564         mdu_disk_info_t disk;
565         int remaining_disks;
566         int i;
567
568         if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
569                 return;
570
571         remaining_disks = array.nr_disks;
572         for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
573                 char buf[40];
574                 int sfd;
575                 disk.number = i;
576                 if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
577                         continue;
578                 if (disk.major == 0 && disk.minor == 0)
579                         continue;
580                 remaining_disks--;
581                 if (disp == 'f' && (disk.state & 1) != 0) /* already faulty */
582                         continue;
583                 sprintf(buf, "%d:%d", disk.major, disk.minor);
584                 sfd = dev_open(buf, O_RDONLY);
585                 if (sfd >= 0) {
586                         /* Not detached */
587                         close(sfd);
588                         continue;
589                 }
590                 if (errno != ENXIO)
591                         /* Probably not detached */
592                         continue;
593                 dv = add_one(dv, buf, disp);
594         }
595 }
596
597 static void add_set(struct mddev_dev *dv, int fd, char set_char)
598 {
599         mdu_array_info_t array;
600         mdu_disk_info_t disk;
601         int remaining_disks;
602         int copies, set;
603         int i;
604
605         if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
606                 return;
607         if (array.level != 10)
608                 return;
609         copies = ((array.layout & 0xff) *
610                   ((array.layout >> 8) & 0xff));
611         if (array.raid_disks % copies)
612                 return;
613
614         remaining_disks = array.nr_disks;
615         for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
616                 char buf[40];
617                 disk.number = i;
618                 if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
619                         continue;
620                 if (disk.major == 0 && disk.minor == 0)
621                         continue;
622                 remaining_disks--;
623                 set = disk.raid_disk % copies;
624                 if (set_char != set + 'A')
625                         continue;
626                 sprintf(buf, "%d:%d", disk.major, disk.minor);
627                 dv = add_one(dv, buf, dv->disposition);
628         }
629 }
630
631 int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
632                    struct supertype *dev_st, struct supertype *tst,
633                    unsigned long rdev,
634                    char *update, char *devname, int verbose,
635                    mdu_array_info_t *array)
636 {
637         struct mdinfo mdi;
638         int duuid[4];
639         int ouuid[4];
640
641         dev_st->ss->getinfo_super(dev_st, &mdi, NULL);
642         dev_st->ss->uuid_from_super(dev_st, ouuid);
643         if (tst->sb)
644                 tst->ss->uuid_from_super(tst, duuid);
645         else
646                 /* Assume uuid matches: kernel will check */
647                 memcpy(duuid, ouuid, sizeof(ouuid));
648         if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) &&
649             !(mdi.disk.state & (1<<MD_DISK_FAULTY)) &&
650             memcmp(duuid, ouuid, sizeof(ouuid))==0) {
651                 /* Looks like it is worth a
652                  * try.  Need to make sure
653                  * kernel will accept it
654                  * though.
655                  */
656                 mdu_disk_info_t disc;
657                 /* re-add doesn't work for version-1 superblocks
658                  * before 2.6.18 :-(
659                  */
660                 if (array->major_version == 1 &&
661                     get_linux_version() <= 2006018)
662                         goto skip_re_add;
663                 disc.number = mdi.disk.number;
664                 if (ioctl(fd, GET_DISK_INFO, &disc) != 0
665                     || disc.major != 0 || disc.minor != 0
666                         )
667                         goto skip_re_add;
668                 disc.major = major(rdev);
669                 disc.minor = minor(rdev);
670                 disc.number = mdi.disk.number;
671                 disc.raid_disk = mdi.disk.raid_disk;
672                 disc.state = mdi.disk.state;
673                 if (array->state & (1 << MD_SB_CLUSTERED)) {
674                         /* extra flags are needed when adding to a cluster as
675                          * there are two cases to distinguish
676                          */
677                         if (dv->disposition == 'c')
678                                 disc.state |= (1 << MD_DISK_CANDIDATE);
679                         else
680                                 disc.state |= (1 << MD_DISK_CLUSTER_ADD);
681                 }
682                 if (dv->writemostly == FlagSet)
683                         disc.state |= 1 << MD_DISK_WRITEMOSTLY;
684                 if (dv->writemostly == FlagClear)
685                         disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
686                 if (dv->failfast == FlagSet)
687                         disc.state |= 1 << MD_DISK_FAILFAST;
688                 if (dv->failfast == FlagClear)
689                         disc.state &= ~(1 << MD_DISK_FAILFAST);
690                 remove_partitions(tfd);
691                 if (update || dv->writemostly != FlagDefault
692                         || dv->failfast != FlagDefault) {
693                         int rv = -1;
694                         tfd = dev_open(dv->devname, O_RDWR);
695                         if (tfd < 0) {
696                                 pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
697                                 return -1;
698                         }
699
700                         if (dv->writemostly == FlagSet)
701                                 rv = dev_st->ss->update_super(
702                                         dev_st, NULL, "writemostly",
703                                         devname, verbose, 0, NULL);
704                         if (dv->writemostly == FlagClear)
705                                 rv = dev_st->ss->update_super(
706                                         dev_st, NULL, "readwrite",
707                                         devname, verbose, 0, NULL);
708                         if (dv->failfast == FlagSet)
709                                 rv = dev_st->ss->update_super(
710                                         dev_st, NULL, "failfast",
711                                         devname, verbose, 0, NULL);
712                         if (dv->failfast == FlagClear)
713                                 rv = dev_st->ss->update_super(
714                                         dev_st, NULL, "nofailfast",
715                                         devname, verbose, 0, NULL);
716                         if (update)
717                                 rv = dev_st->ss->update_super(
718                                         dev_st, NULL, update,
719                                         devname, verbose, 0, NULL);
720                         if (rv == 0)
721                                 rv = dev_st->ss->store_super(dev_st, tfd);
722                         close(tfd);
723                         if (rv != 0) {
724                                 pr_err("failed to update superblock during re-add\n");
725                                 return -1;
726                         }
727                 }
728                 /* don't even try if disk is marked as faulty */
729                 errno = 0;
730                 if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
731                         if (verbose >= 0)
732                                 pr_err("re-added %s\n", dv->devname);
733                         return 1;
734                 }
735                 if (errno == ENOMEM || errno == EROFS) {
736                         pr_err("add new device failed for %s: %s\n",
737                                dv->devname, strerror(errno));
738                         if (dv->disposition == 'M')
739                                 return 0;
740                         return -1;
741                 }
742         }
743 skip_re_add:
744         return 0;
745 }
746
747 int Manage_add(int fd, int tfd, struct mddev_dev *dv,
748                struct supertype *tst, mdu_array_info_t *array,
749                int force, int verbose, char *devname,
750                char *update, unsigned long rdev, unsigned long long array_size,
751                int raid_slot)
752 {
753         unsigned long long ldsize;
754         struct supertype *dev_st;
755         int j;
756         mdu_disk_info_t disc;
757
758         if (!get_dev_size(tfd, dv->devname, &ldsize)) {
759                 if (dv->disposition == 'M')
760                         return 0;
761                 else
762                         return -1;
763         }
764
765         if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
766                 /* More than 4TB is wasted on v0.90 */
767                 if (!force) {
768                         pr_err("%s is larger than %s can effectively use.\n"
769                                "       Add --force is you really want to add this device.\n",
770                                dv->devname, devname);
771                         return -1;
772                 }
773                 pr_err("%s is larger than %s can effectively use.\n"
774                        "       Adding anyway as --force was given.\n",
775                        dv->devname, devname);
776         }
777         if (!tst->ss->external &&
778             array->major_version == 0 &&
779             md_get_version(fd)%100 < 2) {
780                 if (ioctl(fd, HOT_ADD_DISK, rdev)==0) {
781                         if (verbose >= 0)
782                                 pr_err("hot added %s\n",
783                                        dv->devname);
784                         return 1;
785                 }
786
787                 pr_err("hot add failed for %s: %s\n",
788                        dv->devname, strerror(errno));
789                 return -1;
790         }
791
792         if (array->not_persistent == 0 || tst->ss->external) {
793
794                 /* need to find a sample superblock to copy, and
795                  * a spare slot to use.
796                  * For 'external' array (well, container based),
797                  * We can just load the metadata for the array->
798                  */
799                 int array_failed;
800                 if (tst->sb)
801                         /* already loaded */;
802                 else if (tst->ss->external) {
803                         tst->ss->load_container(tst, fd, NULL);
804                 } else for (j = 0; j < tst->max_devs; j++) {
805                                 char *dev;
806                                 int dfd;
807                                 disc.number = j;
808                                 if (ioctl(fd, GET_DISK_INFO, &disc))
809                                         continue;
810                                 if (disc.major==0 && disc.minor==0)
811                                         continue;
812                                 if ((disc.state & 4)==0) /* sync */
813                                         continue;
814                                 /* Looks like a good device to try */
815                                 dev = map_dev(disc.major, disc.minor, 1);
816                                 if (!dev)
817                                         continue;
818                                 dfd = dev_open(dev, O_RDONLY);
819                                 if (dfd < 0)
820                                         continue;
821                                 if (tst->ss->load_super(tst, dfd,
822                                                         NULL)) {
823                                         close(dfd);
824                                         continue;
825                                 }
826                                 close(dfd);
827                                 break;
828                         }
829                 /* FIXME this is a bad test to be using */
830                 if (!tst->sb && (dv->disposition != 'a'
831                                  && dv->disposition != 'S')) {
832                         /* we are re-adding a device to a
833                          * completely dead array - have to depend
834                          * on kernel to check
835                          */
836                 } else if (!tst->sb) {
837                         pr_err("cannot load array metadata from %s\n", devname);
838                         return -1;
839                 }
840
841                 /* Make sure device is large enough */
842                 if (dv->disposition != 'j' &&  /* skip size check for Journal */
843                     tst->sb &&
844                     tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
845                     array_size) {
846                         if (dv->disposition == 'M')
847                                 return 0;
848                         pr_err("%s not large enough to join array\n",
849                                dv->devname);
850                         return -1;
851                 }
852
853                 /* Possibly this device was recently part of
854                  * the array and was temporarily removed, and
855                  * is now being re-added.  If so, we can
856                  * simply re-add it.
857                  */
858
859                 if (array->not_persistent == 0) {
860                         dev_st = dup_super(tst);
861                         dev_st->ss->load_super(dev_st, tfd, NULL);
862                         if (dev_st->sb && dv->disposition != 'S') {
863                                 int rv;
864
865                                 rv = attempt_re_add(fd, tfd, dv, dev_st, tst,
866                                                     rdev, update, devname,
867                                                     verbose, array);
868                                 dev_st->ss->free_super(dev_st);
869                                 if (rv)
870                                         return rv;
871                         }
872                 }
873                 if (dv->disposition == 'M') {
874                         if (verbose > 0)
875                                 pr_err("--re-add for %s to %s is not possible\n",
876                                        dv->devname, devname);
877                         return 0;
878                 }
879                 if (dv->disposition == 'A') {
880                         pr_err("--re-add for %s to %s is not possible\n",
881                                dv->devname, devname);
882                         return -1;
883                 }
884                 if (array->active_disks < array->raid_disks) {
885                         char *avail = xcalloc(array->raid_disks, 1);
886                         int d;
887                         int found = 0;
888
889                         for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
890                                 disc.number = d;
891                                 if (ioctl(fd, GET_DISK_INFO, &disc))
892                                         continue;
893                                 if (disc.major == 0 && disc.minor == 0)
894                                         continue;
895                                 if (!(disc.state & (1<<MD_DISK_SYNC)))
896                                         continue;
897                                 avail[disc.raid_disk] = 1;
898                                 found++;
899                         }
900                         array_failed = !enough(array->level, array->raid_disks,
901                                                array->layout, 1, avail);
902                         free(avail);
903                 } else
904                         array_failed = 0;
905                 if (array_failed) {
906                         pr_err("%s has failed so using --add cannot work and might destroy\n",
907                                devname);
908                         pr_err("data on %s.  You should stop the array and re-assemble it.\n",
909                                dv->devname);
910                         return -1;
911                 }
912         } else {
913                 /* non-persistent. Must ensure that new drive
914                  * is at least array->size big.
915                  */
916                 if (ldsize/512 < array_size) {
917                         pr_err("%s not large enough to join array\n",
918                                dv->devname);
919                         return -1;
920                 }
921         }
922         /* committed to really trying this device now*/
923         remove_partitions(tfd);
924
925         /* in 2.6.17 and earlier, version-1 superblocks won't
926          * use the number we write, but will choose a free number.
927          * we must choose the same free number, which requires
928          * starting at 'raid_disks' and counting up
929          */
930         for (j = array->raid_disks; j < tst->max_devs; j++) {
931                 disc.number = j;
932                 if (ioctl(fd, GET_DISK_INFO, &disc))
933                         break;
934                 if (disc.major==0 && disc.minor==0)
935                         break;
936                 if (disc.state & 8) /* removed */
937                         break;
938         }
939         disc.major = major(rdev);
940         disc.minor = minor(rdev);
941         if (raid_slot < 0)
942                 disc.number = j;
943         else
944                 disc.number = raid_slot;
945         disc.state = 0;
946
947         /* only add journal to array that supports journaling */
948         if (dv->disposition == 'j') {
949                 struct mdinfo mdi;
950                 struct mdinfo *mdp;
951
952                 mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
953                 if (!mdp) {
954                         pr_err("%s unable to read array state.\n", devname);
955                         return -1;
956                 }
957
958                 if (strncmp(mdp->sysfs_array_state, "readonly", 8) != 0) {
959                         sysfs_free(mdp);
960                         pr_err("%s is not readonly, cannot add journal.\n", devname);
961                         return -1;
962                 }
963
964                 sysfs_free(mdp);
965
966                 tst->ss->getinfo_super(tst, &mdi, NULL);
967                 if (mdi.journal_device_required == 0) {
968                         pr_err("%s does not support journal device.\n", devname);
969                         return -1;
970                 }
971                 disc.raid_disk = 0;
972         }
973
974         if (array->not_persistent==0) {
975                 int dfd;
976                 if (dv->disposition == 'j')
977                         disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
978                 if (dv->writemostly == FlagSet)
979                         disc.state |= 1 << MD_DISK_WRITEMOSTLY;
980                 if (dv->failfast == FlagSet)
981                         disc.state |= 1 << MD_DISK_FAILFAST;
982                 dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
983                 if (tst->ss->add_to_super(tst, &disc, dfd,
984                                           dv->devname, INVALID_SECTORS))
985                         return -1;
986                 if (tst->ss->write_init_super(tst))
987                         return -1;
988         } else if (dv->disposition == 'A') {
989                 /*  this had better be raid1.
990                  * As we are "--re-add"ing we must find a spare slot
991                  * to fill.
992                  */
993                 char *used = xcalloc(array->raid_disks, 1);
994                 for (j = 0; j < tst->max_devs; j++) {
995                         mdu_disk_info_t disc2;
996                         disc2.number = j;
997                         if (ioctl(fd, GET_DISK_INFO, &disc2))
998                                 continue;
999                         if (disc2.major==0 && disc2.minor==0)
1000                                 continue;
1001                         if (disc2.state & 8) /* removed */
1002                                 continue;
1003                         if (disc2.raid_disk < 0)
1004                                 continue;
1005                         if (disc2.raid_disk > array->raid_disks)
1006                                 continue;
1007                         used[disc2.raid_disk] = 1;
1008                 }
1009                 for (j = 0 ; j < array->raid_disks; j++)
1010                         if (!used[j]) {
1011                                 disc.raid_disk = j;
1012                                 disc.state |= (1<<MD_DISK_SYNC);
1013                                 break;
1014                         }
1015                 free(used);
1016         }
1017
1018         if (array->state & (1 << MD_SB_CLUSTERED)) {
1019                 if (dv->disposition == 'c')
1020                         disc.state |= (1 << MD_DISK_CANDIDATE);
1021                 else
1022                         disc.state |= (1 << MD_DISK_CLUSTER_ADD);
1023         }
1024
1025         if (dv->writemostly == FlagSet)
1026                 disc.state |= (1 << MD_DISK_WRITEMOSTLY);
1027         if (dv->failfast == FlagSet)
1028                 disc.state |= (1 << MD_DISK_FAILFAST);
1029         if (tst->ss->external) {
1030                 /* add a disk
1031                  * to an external metadata container */
1032                 struct mdinfo new_mdi;
1033                 struct mdinfo *sra;
1034                 int container_fd;
1035                 char devnm[32];
1036                 int dfd;
1037
1038                 strcpy(devnm, fd2devnm(fd));
1039
1040                 container_fd = open_dev_excl(devnm);
1041                 if (container_fd < 0) {
1042                         pr_err("add failed for %s: could not get exclusive access to container\n",
1043                                dv->devname);
1044                         tst->ss->free_super(tst);
1045                         return -1;
1046                 }
1047
1048                 Kill(dv->devname, NULL, 0, -1, 0);
1049                 dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
1050                 if (mdmon_running(tst->container_devnm))
1051                         tst->update_tail = &tst->updates;
1052                 if (tst->ss->add_to_super(tst, &disc, dfd,
1053                                           dv->devname, INVALID_SECTORS)) {
1054                         close(dfd);
1055                         close(container_fd);
1056                         return -1;
1057                 }
1058                 if (tst->update_tail)
1059                         flush_metadata_updates(tst);
1060                 else
1061                         tst->ss->sync_metadata(tst);
1062
1063                 sra = sysfs_read(container_fd, NULL, 0);
1064                 if (!sra) {
1065                         pr_err("add failed for %s: sysfs_read failed\n",
1066                                dv->devname);
1067                         close(container_fd);
1068                         tst->ss->free_super(tst);
1069                         return -1;
1070                 }
1071                 sra->array.level = LEVEL_CONTAINER;
1072                 /* Need to set data_offset and component_size */
1073                 tst->ss->getinfo_super(tst, &new_mdi, NULL);
1074                 new_mdi.disk.major = disc.major;
1075                 new_mdi.disk.minor = disc.minor;
1076                 new_mdi.recovery_start = 0;
1077                 /* Make sure fds are closed as they are O_EXCL which
1078                  * would block add_disk */
1079                 tst->ss->free_super(tst);
1080                 if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
1081                         pr_err("add new device to external metadata failed for %s\n", dv->devname);
1082                         close(container_fd);
1083                         sysfs_free(sra);
1084                         return -1;
1085                 }
1086                 ping_monitor(devnm);
1087                 sysfs_free(sra);
1088                 close(container_fd);
1089         } else {
1090                 tst->ss->free_super(tst);
1091                 if (ioctl(fd, ADD_NEW_DISK, &disc)) {
1092                         if (dv->disposition == 'j')
1093                                 pr_err("Failed to hot add %s as journal, "
1094                                        "please try restart %s.\n", dv->devname, devname);
1095                         else
1096                                 pr_err("add new device failed for %s as %d: %s\n",
1097                                        dv->devname, j, strerror(errno));
1098                         return -1;
1099                 }
1100                 if (dv->disposition == 'j') {
1101                         pr_err("Journal added successfully, making %s read-write\n", devname);
1102                         if (Manage_ro(devname, fd, -1))
1103                                 pr_err("Failed to make %s read-write\n", devname);
1104                 }
1105
1106         }
1107         if (verbose >= 0)
1108                 pr_err("added %s\n", dv->devname);
1109         return 1;
1110 }
1111
1112 int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
1113                   int sysfd, unsigned long rdev, int verbose, char *devname)
1114 {
1115         int lfd = -1;
1116         int err;
1117
1118         if (tst->ss->external) {
1119                 /* To remove a device from a container, we must
1120                  * check that it isn't in use in an array.
1121                  * This involves looking in the 'holders'
1122                  * directory - there must be just one entry,
1123                  * the container.
1124                  * To ensure that it doesn't get used as a
1125                  * hot spare while we are checking, we
1126                  * get an O_EXCL open on the container
1127                  */
1128                 int ret;
1129                 char devnm[32];
1130                 strcpy(devnm, fd2devnm(fd));
1131                 lfd = open_dev_excl(devnm);
1132                 if (lfd < 0) {
1133                         pr_err("Cannot get exclusive access  to container - odd\n");
1134                         return -1;
1135                 }
1136                 /* We may not be able to check on holders in
1137                  * sysfs, either because we don't have the dev num
1138                  * (rdev == 0) or because the device has been detached
1139                  * and the 'holders' directory no longer exists
1140                  * (ret == -1).  In that case, assume it is OK to
1141                  * remove.
1142                  */
1143                 if (rdev == 0)
1144                         ret = -1;
1145                 else {
1146                         /*
1147                          * The drive has already been set to 'faulty', however
1148                          * monitor might not have had time to process it and the
1149                          * drive might still have an entry in the 'holders'
1150                          * directory. Try a few times to avoid a false error
1151                          */
1152                         int count = 20;
1153
1154                         do {
1155                                 ret = sysfs_unique_holder(devnm, rdev);
1156                                 if (ret < 2)
1157                                         break;
1158                                 usleep(100 * 1000);     /* 100ms */
1159                         } while (--count > 0);
1160
1161                         if (ret == 0) {
1162                                 pr_err("%s is not a member, cannot remove.\n",
1163                                         dv->devname);
1164                                 close(lfd);
1165                                 return -1;
1166                         }
1167                         if (ret >= 2) {
1168                                 pr_err("%s is still in use, cannot remove.\n",
1169                                         dv->devname);
1170                                 close(lfd);
1171                                 return -1;
1172                         }
1173                 }
1174         }
1175         /* FIXME check that it is a current member */
1176         if (sysfd >= 0) {
1177                 /* device has been removed and we don't know
1178                  * the major:minor number
1179                  */
1180                 int n = write(sysfd, "remove", 6);
1181                 if (n != 6)
1182                         err = -1;
1183                 else
1184                         err = 0;
1185         } else {
1186                 err = ioctl(fd, HOT_REMOVE_DISK, rdev);
1187                 if (err && errno == ENODEV) {
1188                         /* Old kernels rejected this if no personality
1189                          * is registered */
1190                         struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS);
1191                         struct mdinfo *dv = NULL;
1192                         if (sra)
1193                                 dv = sra->devs;
1194                         for ( ; dv ; dv=dv->next)
1195                                 if (dv->disk.major == (int)major(rdev) &&
1196                                     dv->disk.minor == (int)minor(rdev))
1197                                         break;
1198                         if (dv)
1199                                 err = sysfs_set_str(sra, dv,
1200                                                     "state", "remove");
1201                         else
1202                                 err = -1;
1203                         sysfs_free(sra);
1204                 }
1205         }
1206         if (err) {
1207                 pr_err("hot remove failed for %s: %s\n",        dv->devname,
1208                        strerror(errno));
1209                 if (lfd >= 0)
1210                         close(lfd);
1211                 return -1;
1212         }
1213         if (tst->ss->external) {
1214                 /*
1215                  * Before dropping our exclusive open we make an
1216                  * attempt at preventing mdmon from seeing an
1217                  * 'add' event before reconciling this 'remove'
1218                  * event.
1219                  */
1220                 char *devnm = fd2devnm(fd);
1221
1222                 if (!devnm) {
1223                         pr_err("unable to get container name\n");
1224                         return -1;
1225                 }
1226
1227                 ping_manager(devnm);
1228         }
1229         if (lfd >= 0)
1230                 close(lfd);
1231         if (verbose >= 0)
1232                 pr_err("hot removed %s from %s\n",
1233                        dv->devname, devname);
1234         return 1;
1235 }
1236
1237 int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv,
1238                    unsigned long rdev, int verbose, char *devname)
1239 {
1240         struct mdinfo *mdi, *di;
1241         if (tst->ss->external) {
1242                 pr_err("--replace only supported for native metadata (0.90 or 1.x)\n");
1243                 return -1;
1244         }
1245         /* Need to find the device in sysfs and add 'want_replacement' to the
1246          * status.
1247          */
1248         mdi = sysfs_read(fd, NULL, GET_DEVS);
1249         if (!mdi || !mdi->devs) {
1250                 pr_err("Cannot find status of %s to enable replacement - strange\n",
1251                        devname);
1252                 return -1;
1253         }
1254         for (di = mdi->devs; di; di = di->next)
1255                 if (di->disk.major == (int)major(rdev) &&
1256                     di->disk.minor == (int)minor(rdev))
1257                         break;
1258         if (di) {
1259                 int rv;
1260                 if (di->disk.raid_disk < 0) {
1261                         pr_err("%s is not active and so cannot be replaced.\n",
1262                                dv->devname);
1263                         sysfs_free(mdi);
1264                         return -1;
1265                 }
1266                 rv = sysfs_set_str(mdi, di,
1267                                    "state", "want_replacement");
1268                 if (rv) {
1269                         sysfs_free(mdi);
1270                         pr_err("Failed to request replacement for %s\n",
1271                                dv->devname);
1272                         return -1;
1273                 }
1274                 if (verbose >= 0)
1275                         pr_err("Marked %s (device %d in %s) for replacement\n",
1276                                dv->devname, di->disk.raid_disk, devname);
1277                 /* If there is a matching 'with', we need to tell it which
1278                  * raid disk
1279                  */
1280                 while (dv && dv->disposition != 'W')
1281                         dv = dv->next;
1282                 if (dv) {
1283                         dv->disposition = 'w';
1284                         dv->used = di->disk.raid_disk;
1285                 }
1286                 return 1;
1287         }
1288         sysfs_free(mdi);
1289         pr_err("%s not found in %s so cannot --replace it\n",
1290                dv->devname, devname);
1291         return -1;
1292 }
1293
1294 int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
1295                 unsigned long rdev, int verbose, char *devname)
1296 {
1297         struct mdinfo *mdi, *di;
1298         /* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
1299         mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
1300         if (!mdi || !mdi->devs) {
1301                 pr_err("Cannot find status of %s to enable replacement - strange\n",
1302                        devname);
1303                 return -1;
1304         }
1305         for (di = mdi->devs; di; di = di->next)
1306                 if (di->disk.major == (int)major(rdev) &&
1307                     di->disk.minor == (int)minor(rdev))
1308                         break;
1309         if (di) {
1310                 int rv;
1311                 if (di->disk.state & (1<<MD_DISK_FAULTY)) {
1312                         pr_err("%s is faulty and cannot be a replacement\n",
1313                                dv->devname);
1314                         sysfs_free(mdi);
1315                         return -1;
1316                 }
1317                 if (di->disk.raid_disk >= 0) {
1318                         pr_err("%s is active and cannot be a replacement\n",
1319                                dv->devname);
1320                         sysfs_free(mdi);
1321                         return -1;
1322                 }
1323                 rv = sysfs_set_num(mdi, di,
1324                                    "slot", dv->used);
1325                 if (rv) {
1326                         sysfs_free(mdi);
1327                         pr_err("Failed to set %s as preferred replacement.\n",
1328                                dv->devname);
1329                         return -1;
1330                 }
1331                 if (verbose >= 0)
1332                         pr_err("Marked %s in %s as replacement for device %d\n",
1333                                dv->devname, devname, dv->used);
1334                 return 1;
1335         }
1336         sysfs_free(mdi);
1337         pr_err("%s not found in %s so cannot make it preferred replacement\n",
1338                dv->devname, devname);
1339         return -1;
1340 }
1341
1342 int Manage_subdevs(char *devname, int fd,
1343                    struct mddev_dev *devlist, int verbose, int test,
1344                    char *update, int force)
1345 {
1346         /* Do something to each dev.
1347          * devmode can be
1348          *  'a' - add the device
1349          *         try HOT_ADD_DISK
1350          *         If that fails EINVAL, try ADD_NEW_DISK
1351          *  'S' - add the device as a spare - don't try re-add
1352          *  'j' - add the device as a journal device
1353          *  'A' - re-add the device
1354          *  'r' - remove the device: HOT_REMOVE_DISK
1355          *        device can be 'faulty' or 'detached' in which case all
1356          *        matching devices are removed.
1357          *  'f' - set the device faulty SET_DISK_FAULTY
1358          *        device can be 'detached' in which case any device that
1359          *        is inaccessible will be marked faulty.
1360          *  'R' - mark this device as wanting replacement.
1361          *  'W' - this device is added if necessary and activated as
1362          *        a replacement for a previous 'R' device.
1363          * -----
1364          *  'w' - 'W' will be changed to 'w' when it is paired with
1365          *        a 'R' device.  If a 'W' is found while walking the list
1366          *        it must be unpaired, and is an error.
1367          *  'M' - this is created by a 'missing' target.  It is a slight
1368          *        variant on 'A'
1369          *  'F' - Another variant of 'A', where the device was faulty
1370          *        so must be removed from the array first.
1371          *  'c' - confirm the device as found (for clustered environments)
1372          *
1373          * For 'f' and 'r', the device can also be a kernel-internal
1374          * name such as 'sdb'.
1375          */
1376         mdu_array_info_t array;
1377         unsigned long long array_size;
1378         struct mddev_dev *dv;
1379         int tfd = -1;
1380         struct supertype *tst;
1381         char *subarray = NULL;
1382         int sysfd = -1;
1383         int count = 0; /* number of actions taken */
1384         struct mdinfo info;
1385         struct mdinfo devinfo;
1386         int frozen = 0;
1387         int busy = 0;
1388         int raid_slot = -1;
1389
1390         if (ioctl(fd, GET_ARRAY_INFO, &array)) {
1391                 pr_err("Cannot get array info for %s\n",
1392                         devname);
1393                 goto abort;
1394         }
1395         sysfs_init(&info, fd, NULL);
1396
1397         /* array.size is only 32 bits and may be truncated.
1398          * So read from sysfs if possible, and record number of sectors
1399          */
1400
1401         array_size = get_component_size(fd);
1402         if (array_size <= 0)
1403                 array_size = array.size * 2;
1404
1405         tst = super_by_fd(fd, &subarray);
1406         if (!tst) {
1407                 pr_err("unsupport array - version %d.%d\n",
1408                         array.major_version, array.minor_version);
1409                 goto abort;
1410         }
1411
1412         for (dv = devlist; dv; dv = dv->next) {
1413                 unsigned long rdev = 0; /* device to add/remove etc */
1414                 int rv;
1415                 int mj,mn;
1416
1417                 raid_slot = -1;
1418                 if (dv->disposition == 'c') {
1419                         rv = parse_cluster_confirm_arg(dv->devname,
1420                                                        &dv->devname,
1421                                                        &raid_slot);
1422                         if (rv) {
1423                                 pr_err("Could not get the devname of cluster\n");
1424                                 goto abort;
1425                         }
1426                 }
1427
1428                 if (strcmp(dv->devname, "failed") == 0 ||
1429                     strcmp(dv->devname, "faulty") == 0) {
1430                         if (dv->disposition != 'A'
1431                             && dv->disposition != 'r') {
1432                                 pr_err("%s only meaningful with -r or --re-add, not -%c\n",
1433                                         dv->devname, dv->disposition);
1434                                 goto abort;
1435                         }
1436                         add_faulty(dv, fd, (dv->disposition == 'A'
1437                                             ? 'F' : 'r'));
1438                         continue;
1439                 }
1440                 if (strcmp(dv->devname, "detached") == 0) {
1441                         if (dv->disposition != 'r' && dv->disposition != 'f') {
1442                                 pr_err("%s only meaningful with -r of -f, not -%c\n",
1443                                         dv->devname, dv->disposition);
1444                                 goto abort;
1445                         }
1446                         add_detached(dv, fd, dv->disposition);
1447                         continue;
1448                 }
1449
1450                 if (strcmp(dv->devname, "missing") == 0) {
1451                         struct mddev_dev *add_devlist;
1452                         struct mddev_dev **dp;
1453                         if (dv->disposition == 'c') {
1454                                 rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
1455                                 break;
1456                         }
1457
1458                         if (dv->disposition != 'A') {
1459                                 pr_err("'missing' only meaningful with --re-add\n");
1460                                 goto abort;
1461                         }
1462                         add_devlist = conf_get_devs();
1463                         if (add_devlist == NULL) {
1464                                 pr_err("no devices to scan for missing members.");
1465                                 continue;
1466                         }
1467                         for (dp = &add_devlist; *dp; dp = & (*dp)->next)
1468                                 /* 'M' (for 'missing') is like 'A' without errors */
1469                                 (*dp)->disposition = 'M';
1470                         *dp = dv->next;
1471                         dv->next = add_devlist;
1472                         continue;
1473                 }
1474
1475                 if (strncmp(dv->devname, "set-", 4) == 0 &&
1476                     strlen(dv->devname) == 5) {
1477                         int copies;
1478
1479                         if (dv->disposition != 'r' &&
1480                             dv->disposition != 'f') {
1481                                 pr_err("'%s' only meaningful with -r or -f\n",
1482                                        dv->devname);
1483                                 goto abort;
1484                         }
1485                         if (array.level != 10) {
1486                                 pr_err("'%s' only meaningful with RAID10 arrays\n",
1487                                        dv->devname);
1488                                 goto abort;
1489                         }
1490                         copies = ((array.layout & 0xff) *
1491                                   ((array.layout >> 8) & 0xff));
1492                         if (array.raid_disks % copies != 0 ||
1493                             dv->devname[4] < 'A' ||
1494                             dv->devname[4] >= 'A' + copies ||
1495                             copies > 26) {
1496                                 pr_err("'%s' not meaningful with this array\n",
1497                                        dv->devname);
1498                                 goto abort;
1499                         }
1500                         add_set(dv, fd, dv->devname[4]);
1501                         continue;
1502                 }
1503
1504                 if (strchr(dv->devname, '/') == NULL &&
1505                     strchr(dv->devname, ':') == NULL &&
1506                     strlen(dv->devname) < 50) {
1507                         /* Assume this is a kernel-internal name like 'sda1' */
1508                         int found = 0;
1509                         char dname[55];
1510                         if (dv->disposition != 'r' && dv->disposition != 'f') {
1511                                 pr_err("%s only meaningful with -r or -f, not -%c\n",
1512                                         dv->devname, dv->disposition);
1513                                 goto abort;
1514                         }
1515
1516                         sprintf(dname, "dev-%s", dv->devname);
1517                         sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
1518                         if (sysfd >= 0) {
1519                                 char dn[20];
1520                                 if (sysfs_fd_get_str(sysfd, dn, 20) > 0 &&
1521                                     sscanf(dn, "%d:%d", &mj,&mn) == 2) {
1522                                         rdev = makedev(mj,mn);
1523                                         found = 1;
1524                                 }
1525                                 close(sysfd);
1526                                 sysfd = -1;
1527                         }
1528                         if (!found) {
1529                                 sysfd = sysfs_open(fd2devnm(fd), dname, "state");
1530                                 if (sysfd < 0) {
1531                                         pr_err("%s does not appear to be a component of %s\n",
1532                                                 dv->devname, devname);
1533                                         goto abort;
1534                                 }
1535                         }
1536                 } else if ((dv->disposition == 'r' || dv->disposition == 'f')
1537                            && get_maj_min(dv->devname, &mj, &mn)) {
1538                         /* for 'fail' and 'remove', the device might
1539                          * not exist.
1540                          */
1541                         rdev = makedev(mj, mn);
1542                 } else {
1543                         struct stat stb;
1544                         tfd = dev_open(dv->devname, O_RDONLY);
1545                         if (tfd >= 0) {
1546                                 fstat(tfd, &stb);
1547                                 close(tfd);
1548                         } else {
1549                                 int open_err = errno;
1550                                 if (stat(dv->devname, &stb) != 0) {
1551                                         pr_err("Cannot find %s: %s\n",
1552                                                dv->devname, strerror(errno));
1553                                         goto abort;
1554                                 }
1555                                 if ((stb.st_mode & S_IFMT) != S_IFBLK) {
1556                                         if (dv->disposition == 'M')
1557                                                 /* non-fatal. Also improbable */
1558                                                 continue;
1559                                         pr_err("%s is not a block device.\n",
1560                                                dv->devname);
1561                                         goto abort;
1562                                 }
1563                                 if (dv->disposition == 'r')
1564                                         /* Be happy, the stat worked, that is
1565                                          * enough for --remove
1566                                          */
1567                                         ;
1568                                 else {
1569                                         if (dv->disposition == 'M')
1570                                                 /* non-fatal */
1571                                                 continue;
1572                                         pr_err("Cannot open %s: %s\n",
1573                                                dv->devname, strerror(open_err));
1574                                         goto abort;
1575                                 }
1576                         }
1577                         rdev = stb.st_rdev;
1578                 }
1579                 switch(dv->disposition){
1580                 default:
1581                         pr_err("internal error - devmode[%s]=%d\n",
1582                                 dv->devname, dv->disposition);
1583                         goto abort;
1584                 case 'a':
1585                 case 'S': /* --add-spare */
1586                 case 'j': /* --add-journal */
1587                 case 'A':
1588                 case 'M': /* --re-add missing */
1589                 case 'F': /* --re-add faulty  */
1590                 case 'c': /* --cluster-confirm */
1591                         /* add the device */
1592                         if (subarray) {
1593                                 pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
1594                                 goto abort;
1595                         }
1596
1597                         /* Let's first try to write re-add to sysfs */
1598                         if (rdev != 0 &&
1599                             (dv->disposition == 'A' || dv->disposition == 'F')) {
1600                                 sysfs_init_dev(&devinfo, rdev);
1601                                 if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
1602                                         pr_err("re-add %s to %s succeed\n",
1603                                                 dv->devname, info.sys_name);
1604                                         break;
1605                                 }
1606                         }
1607
1608                         if (dv->disposition == 'F')
1609                                 /* Need to remove first */
1610                                 ioctl(fd, HOT_REMOVE_DISK, rdev);
1611                         /* Make sure it isn't in use (in 2.6 or later) */
1612                         tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
1613                         if (tfd >= 0) {
1614                                 /* We know no-one else is using it.  We'll
1615                                  * need non-exclusive access to add it, so
1616                                  * do that now.
1617                                  */
1618                                 close(tfd);
1619                                 tfd = dev_open(dv->devname, O_RDONLY);
1620                         }
1621                         if (tfd < 0) {
1622                                 if (dv->disposition == 'M')
1623                                         continue;
1624                                 pr_err("Cannot open %s: %s\n",
1625                                         dv->devname, strerror(errno));
1626                                 goto abort;
1627                         }
1628                         if (!frozen) {
1629                                 if (sysfs_freeze_array(&info) == 1)
1630                                         frozen = 1;
1631                                 else
1632                                         frozen = -1;
1633                         }
1634                         rv = Manage_add(fd, tfd, dv, tst, &array,
1635                                         force, verbose, devname, update,
1636                                         rdev, array_size, raid_slot);
1637                         close(tfd);
1638                         tfd = -1;
1639                         if (rv < 0)
1640                                 goto abort;
1641                         if (rv > 0)
1642                                 count++;
1643                         break;
1644
1645                 case 'r':
1646                         /* hot remove */
1647                         if (subarray) {
1648                                 pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
1649                                 rv = -1;
1650                         } else
1651                                 rv = Manage_remove(tst, fd, dv, sysfd,
1652                                                    rdev, verbose,
1653                                                    devname);
1654                         if (sysfd >= 0)
1655                                 close(sysfd);
1656                         sysfd = -1;
1657                         if (rv < 0)
1658                                 goto abort;
1659                         if (rv > 0)
1660                                 count++;
1661                         break;
1662
1663                 case 'f': /* set faulty */
1664                         /* FIXME check current member */
1665                         if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
1666                             (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
1667                                                 rdev))) {
1668                                 if (errno == EBUSY)
1669                                         busy = 1;
1670                                 pr_err("set device faulty failed for %s:  %s\n",
1671                                         dv->devname, strerror(errno));
1672                                 if (sysfd >= 0)
1673                                         close(sysfd);
1674                                 goto abort;
1675                         }
1676                         if (sysfd >= 0)
1677                                 close(sysfd);
1678                         sysfd = -1;
1679                         count++;
1680                         if (verbose >= 0)
1681                                 pr_err("set %s faulty in %s\n",
1682                                         dv->devname, devname);
1683                         break;
1684                 case 'R': /* Mark as replaceable */
1685                         if (subarray) {
1686                                 pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
1687                                 rv = -1;
1688                         } else {
1689                                 if (!frozen) {
1690                                         if (sysfs_freeze_array(&info) == 1)
1691                                                 frozen = 1;
1692                                         else
1693                                                 frozen = -1;
1694                                 }
1695                                 rv = Manage_replace(tst, fd, dv,
1696                                                     rdev, verbose,
1697                                                     devname);
1698                         }
1699                         if (rv < 0)
1700                                 goto abort;
1701                         if (rv > 0)
1702                                 count++;
1703                         break;
1704                 case 'W': /* --with device that doesn't match */
1705                         pr_err("No matching --replace device for --with %s\n",
1706                                dv->devname);
1707                         goto abort;
1708                 case 'w': /* --with device which was matched */
1709                         rv = Manage_with(tst, fd, dv,
1710                                          rdev, verbose, devname);
1711                         if (rv < 0)
1712                                 goto abort;
1713                         break;
1714                 }
1715         }
1716         if (frozen > 0)
1717                 sysfs_set_str(&info, NULL, "sync_action","idle");
1718         if (test && count == 0)
1719                 return 2;
1720         return 0;
1721
1722 abort:
1723         if (frozen > 0)
1724                 sysfs_set_str(&info, NULL, "sync_action","idle");
1725         return !test && busy ? 2 : 1;
1726 }
1727
1728 int autodetect(void)
1729 {
1730         /* Open any md device, and issue the RAID_AUTORUN ioctl */
1731         int rv = 1;
1732         int fd = dev_open("9:0", O_RDONLY);
1733         if (fd >= 0) {
1734                 if (ioctl(fd, RAID_AUTORUN, 0) == 0)
1735                         rv = 0;
1736                 close(fd);
1737         }
1738         return rv;
1739 }
1740
1741 int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int verbose)
1742 {
1743         struct supertype supertype, *st = &supertype;
1744         int fd, rv = 2;
1745
1746         memset(st, 0, sizeof(*st));
1747
1748         fd = open_subarray(dev, subarray, st, verbose < 0);
1749         if (fd < 0)
1750                 return 2;
1751
1752         if (!st->ss->update_subarray) {
1753                 if (verbose >= 0)
1754                         pr_err("Operation not supported for %s metadata\n",
1755                                st->ss->name);
1756                 goto free_super;
1757         }
1758
1759         if (mdmon_running(st->devnm))
1760                 st->update_tail = &st->updates;
1761
1762         rv = st->ss->update_subarray(st, subarray, update, ident);
1763
1764         if (rv) {
1765                 if (verbose >= 0)
1766                         pr_err("Failed to update %s of subarray-%s in %s\n",
1767                                 update, subarray, dev);
1768         } else if (st->update_tail)
1769                 flush_metadata_updates(st);
1770         else
1771                 st->ss->sync_metadata(st);
1772
1773         if (rv == 0 && strcmp(update, "name") == 0 && verbose >= 0)
1774                 pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n",
1775                        subarray, dev);
1776
1777  free_super:
1778         st->ss->free_super(st);
1779         close(fd);
1780
1781         return rv;
1782 }
1783
1784 /* Move spare from one array to another If adding to destination array fails
1785  * add back to original array.
1786  * Returns 1 on success, 0 on failure */
1787 int move_spare(char *from_devname, char *to_devname, dev_t devid)
1788 {
1789         struct mddev_dev devlist;
1790         char devname[20];
1791
1792         /* try to remove and add */
1793         int fd1 = open(to_devname, O_RDONLY);
1794         int fd2 = open(from_devname, O_RDONLY);
1795
1796         if (fd1 < 0 || fd2 < 0) {
1797                 if (fd1>=0) close(fd1);
1798                 if (fd2>=0) close(fd2);
1799                 return 0;
1800         }
1801
1802         devlist.next = NULL;
1803         devlist.used = 0;
1804         devlist.writemostly = FlagDefault;
1805         devlist.failfast = FlagDefault;
1806         devlist.devname = devname;
1807         sprintf(devname, "%d:%d", major(devid), minor(devid));
1808
1809         devlist.disposition = 'r';
1810         if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0) == 0) {
1811                 devlist.disposition = 'a';
1812                 if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0, NULL, 0) == 0) {
1813                         /* make sure manager is aware of changes */
1814                         ping_manager(to_devname);
1815                         ping_manager(from_devname);
1816                         close(fd1);
1817                         close(fd2);
1818                         return 1;
1819                 }
1820                 else Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0);
1821         }
1822         close(fd1);
1823         close(fd2);
1824         return 0;
1825 }
1826 #endif