1 /*
2  * raid1.c : Multiple Devices driver for Linux
3  *
4  * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5  *
6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7  *
8  * RAID-1 management functions.
9  *
10  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11  *
12  * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License as published by
17  * the Free Software Foundation; either version 2, or (at your option)
18  * any later version.
19  *
20  * You should have received a copy of the GNU General Public License
21  * (for example /usr/src/linux/COPYING); if not, write to the Free
22  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24
25 #include <linux/module.h>
26 #include <linux/config.h>
27 #include <linux/slab.h>
28 #include <linux/raid/raid1.h>
29 #include <asm/atomic.h>
30
31 #define MAJOR_NR MD_MAJOR
32 #define MD_DRIVER
33 #define MD_PERSONALITY
34
35 #define MAX_WORK_PER_DISK 128
36
37 #define NR_RESERVED_BUFS        32
38
39
40 /*
41  * The following can be used to debug the driver
42  */
43 #define RAID1_DEBUG     0
44
45 #if RAID1_DEBUG
46 #define PRINTK(x...)   printk(x)
47 #define inline
48 #define __inline__
49 #else
50 #define PRINTK(x...)  do { } while (0)
51 #endif
52
53
54 static mdk_personality_t raid1_personality;
55 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
56 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
57
58 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
59 {
60         /* return a linked list of "cnt" struct buffer_heads.
61          * don't take any off the free list unless we know we can
62          * get all we need, otherwise we could deadlock
63          */
64         struct buffer_head *bh=NULL;
65
66         while(cnt) {
67                 struct buffer_head *t;
68                 md_spin_lock_irq(&conf->device_lock);
69                 if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
70                         while (cnt) {
71                                 t = conf->freebh;
72                                 conf->freebh = t->b_next;
73                                 t->b_next = bh;
74                                 bh = t;
75                                 t->b_state = 0;
76                                 conf->freebh_cnt--;
77                                 cnt--;
78                         }
79                 md_spin_unlock_irq(&conf->device_lock);
80                 if (cnt == 0)
81                         break;
82                 t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
83                 if (t) {
84                         t->b_next = bh;
85                         bh = t;
86                         cnt--;
87                 } else {
88                         PRINTK("raid1: waiting for %d bh\n", cnt);
89                         conf->freebh_blocked = 1;
90                         wait_disk_event(conf->wait_buffer,
91                                         !conf->freebh_blocked ||
92                                         conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
93                         conf->freebh_blocked = 0;
94                 }
95         }
96         return bh;
97 }
98
99 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
100 {
101         unsigned long flags;
102         spin_lock_irqsave(&conf->device_lock, flags);
103         while (bh) {
104                 struct buffer_head *t = bh;
105                 bh=bh->b_next;
106                 if (t->b_pprev == NULL)
107                         kmem_cache_free(bh_cachep, t);
108                 else {
109                         t->b_next= conf->freebh;
110                         conf->freebh = t;
111                         conf->freebh_cnt++;
112                 }
113         }
114         spin_unlock_irqrestore(&conf->device_lock, flags);
115         wake_up(&conf->wait_buffer);
116 }
117
118 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
119 {
120         /* allocate cnt buffer_heads, possibly less if kmalloc fails */
121         int i = 0;
122
123         while (i < cnt) {
124                 struct buffer_head *bh;
125                 bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
126                 if (!bh) break;
127
128                 md_spin_lock_irq(&conf->device_lock);
129                 bh->b_pprev = &conf->freebh;
130                 bh->b_next = conf->freebh;
131                 conf->freebh = bh;
132                 conf->freebh_cnt++;
133                 md_spin_unlock_irq(&conf->device_lock);
134
135                 i++;
136         }
137         return i;
138 }
139
140 static void raid1_shrink_bh(raid1_conf_t *conf)
141 {
142         /* discard all buffer_heads */
143
144         md_spin_lock_irq(&conf->device_lock);
145         while (conf->freebh) {
146                 struct buffer_head *bh = conf->freebh;
147                 conf->freebh = bh->b_next;
148                 kmem_cache_free(bh_cachep, bh);
149                 conf->freebh_cnt--;
150         }
151         md_spin_unlock_irq(&conf->device_lock);
152 }
153                 
154
155 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
156 {
157         struct raid1_bh *r1_bh = NULL;
158
159         do {
160                 md_spin_lock_irq(&conf->device_lock);
161                 if (!conf->freer1_blocked && conf->freer1) {
162                         r1_bh = conf->freer1;
163                         conf->freer1 = r1_bh->next_r1;
164                         conf->freer1_cnt--;
165                         r1_bh->next_r1 = NULL;
166                         r1_bh->state = (1 << R1BH_PreAlloc);
167                         r1_bh->bh_req.b_state = 0;
168                 }
169                 md_spin_unlock_irq(&conf->device_lock);
170                 if (r1_bh)
171                         return r1_bh;
172                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
173                 if (r1_bh) {
174                         memset(r1_bh, 0, sizeof(*r1_bh));
175                         return r1_bh;
176                 }
177                 conf->freer1_blocked = 1;
178                 wait_disk_event(conf->wait_buffer,
179                                 !conf->freer1_blocked ||
180                                 conf->freer1_cnt > NR_RESERVED_BUFS/2
181                         );
182                 conf->freer1_blocked = 0;
183         } while (1);
184 }
185
186 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
187 {
188         struct buffer_head *bh = r1_bh->mirror_bh_list;
189         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
190
191         r1_bh->mirror_bh_list = NULL;
192
193         if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
194                 unsigned long flags;
195                 spin_lock_irqsave(&conf->device_lock, flags);
196                 r1_bh->next_r1 = conf->freer1;
197                 conf->freer1 = r1_bh;
198                 conf->freer1_cnt++;
199                 spin_unlock_irqrestore(&conf->device_lock, flags);
200                 /* don't need to wakeup wait_buffer because
201                  *  raid1_free_bh below will do that
202                  */
203         } else {
204                 kfree(r1_bh);
205         }
206         raid1_free_bh(conf, bh);
207 }
208
209 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
210 {
211         int i = 0;
212
213         while (i < cnt) {
214                 struct raid1_bh *r1_bh;
215                 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
216                 if (!r1_bh)
217                         break;
218                 memset(r1_bh, 0, sizeof(*r1_bh));
219                 set_bit(R1BH_PreAlloc, &r1_bh->state);
220                 r1_bh->mddev = conf->mddev;
221
222                 raid1_free_r1bh(r1_bh);
223                 i++;
224         }
225         return i;
226 }
227
228 static void raid1_shrink_r1bh(raid1_conf_t *conf)
229 {
230         md_spin_lock_irq(&conf->device_lock);
231         while (conf->freer1) {
232                 struct raid1_bh *r1_bh = conf->freer1;
233                 conf->freer1 = r1_bh->next_r1;
234                 conf->freer1_cnt--;
235                 kfree(r1_bh);
236         }
237         md_spin_unlock_irq(&conf->device_lock);
238 }
239
240
241
242 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
243 {
244         unsigned long flags;
245         struct buffer_head *bh = r1_bh->mirror_bh_list;
246         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
247         r1_bh->mirror_bh_list = NULL;
248         
249         spin_lock_irqsave(&conf->device_lock, flags);
250         r1_bh->next_r1 = conf->freebuf;
251         conf->freebuf = r1_bh;
252         spin_unlock_irqrestore(&conf->device_lock, flags);
253         raid1_free_bh(conf, bh);
254 }
255
256 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
257 {
258         struct raid1_bh *r1_bh;
259
260         md_spin_lock_irq(&conf->device_lock);
261         wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
262         r1_bh = conf->freebuf;
263         conf->freebuf = r1_bh->next_r1;
264         r1_bh->next_r1= NULL;
265         md_spin_unlock_irq(&conf->device_lock);
266
267         return r1_bh;
268 }
269
270 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
271 {
272         int i = 0;
273         struct raid1_bh *head = NULL, **tail;
274         tail = &head;
275
276         while (i < cnt) {
277                 struct raid1_bh *r1_bh;
278                 struct page *page;
279
280                 page = alloc_page(GFP_KERNEL);
281                 if (!page)
282                         break;
283
284                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
285                 if (!r1_bh) {
286                         __free_page(page);
287                         break;
288                 }
289                 memset(r1_bh, 0, sizeof(*r1_bh));
290                 r1_bh->bh_req.b_page = page;
291                 r1_bh->bh_req.b_data = page_address(page);
292                 *tail = r1_bh;
293                 r1_bh->next_r1 = NULL;
294                 tail = & r1_bh->next_r1;
295                 i++;
296         }
297         /* this lock probably isn't needed, as at the time when
298          * we are allocating buffers, nobody else will be touching the
299          * freebuf list.  But it doesn't hurt....
300          */
301         md_spin_lock_irq(&conf->device_lock);
302         *tail = conf->freebuf;
303         conf->freebuf = head;
304         md_spin_unlock_irq(&conf->device_lock);
305         return i;
306 }
307
308 static void raid1_shrink_buffers (raid1_conf_t *conf)
309 {
310         struct raid1_bh *head;
311         md_spin_lock_irq(&conf->device_lock);
312         head = conf->freebuf;
313         conf->freebuf = NULL;
314         md_spin_unlock_irq(&conf->device_lock);
315
316         while (head) {
317                 struct raid1_bh *r1_bh = head;
318                 head = r1_bh->next_r1;
319                 __free_page(r1_bh->bh_req.b_page);
320                 kfree(r1_bh);
321         }
322 }
323
324 static int raid1_map (mddev_t *mddev, kdev_t *rdev)
325 {
326         raid1_conf_t *conf = mddev_to_conf(mddev);
327         int i, disks = MD_SB_DISKS;
328
329         /*
330          * Later we do read balancing on the read side 
331          * now we use the first available disk.
332          */
333
334         for (i = 0; i < disks; i++) {
335                 if (conf->mirrors[i].operational) {
336                         *rdev = conf->mirrors[i].dev;
337                         return (0);
338                 }
339         }
340
341         printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
342         return (-1);
343 }
344
345 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
346 {
347         unsigned long flags;
348         mddev_t *mddev = r1_bh->mddev;
349         raid1_conf_t *conf = mddev_to_conf(mddev);
350
351         md_spin_lock_irqsave(&retry_list_lock, flags);
352         if (raid1_retry_list == NULL)
353                 raid1_retry_tail = &raid1_retry_list;
354         *raid1_retry_tail = r1_bh;
355         raid1_retry_tail = &r1_bh->next_r1;
356         r1_bh->next_r1 = NULL;
357         md_spin_unlock_irqrestore(&retry_list_lock, flags);
358         md_wakeup_thread(conf->thread);
359 }
360
361
362 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
363 {
364         unsigned long flags;
365         spin_lock_irqsave(&conf->segment_lock, flags);
366         if (sector < conf->start_active)
367                 conf->cnt_done--;
368         else if (sector >= conf->start_future && conf->phase == phase)
369                 conf->cnt_future--;
370         else if (!--conf->cnt_pending)
371                 wake_up(&conf->wait_ready);
372
373         spin_unlock_irqrestore(&conf->segment_lock, flags);
374 }
375
376 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
377 {
378         unsigned long flags;
379         spin_lock_irqsave(&conf->segment_lock, flags);
380         if (sector >= conf->start_ready)
381                 --conf->cnt_ready;
382         else if (sector >= conf->start_active) {
383                 if (!--conf->cnt_active) {
384                         conf->start_active = conf->start_ready;
385                         wake_up(&conf->wait_done);
386                 }
387         }
388         spin_unlock_irqrestore(&conf->segment_lock, flags);
389 }
390
391 /*
392  * raid1_end_bh_io() is called when we have finished servicing a mirrored
393  * operation and are ready to return a success/failure code to the buffer
394  * cache layer.
395  */
396 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
397 {
398         struct buffer_head *bh = r1_bh->master_bh;
399
400         io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
401                         test_bit(R1BH_SyncPhase, &r1_bh->state));
402
403         bh->b_end_io(bh, uptodate);
404         raid1_free_r1bh(r1_bh);
405 }
406 void raid1_end_request (struct buffer_head *bh, int uptodate)
407 {
408         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
409
410         /*
411          * this branch is our 'one mirror IO has finished' event handler:
412          */
413         if (!uptodate)
414                 md_error (r1_bh->mddev, bh->b_dev);
415         else
416                 /*
417                  * Set R1BH_Uptodate in our master buffer_head, so that
418                  * we will return a good error code to the higher
419                  * levels even if IO on some other mirrored buffer fails.
420                  *
421                  * The 'master' represents the complex operation to 
422                  * user-side. So if something waits for IO, then it will
423                  * wait for the 'master' buffer_head.
424                  */
425                 set_bit (R1BH_Uptodate, &r1_bh->state);
426
427         /*
428          * We split up the read and write side, imho they are 
429          * conceptually different.
430          */
431
432         if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
433                 /*
434                  * we have only one buffer_head on the read side
435                  */
436                 
437                 if (uptodate) {
438                         raid1_end_bh_io(r1_bh, uptodate);
439                         return;
440                 }
441                 /*
442                  * oops, read error:
443                  */
444                 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", 
445                          partition_name(bh->b_dev), bh->b_blocknr);
446                 raid1_reschedule_retry(r1_bh);
447                 return;
448         }
449
450         /*
451          * WRITE:
452          *
453          * Let's see if all mirrored write operations have finished 
454          * already.
455          */
456
457         if (atomic_dec_and_test(&r1_bh->remaining))
458                 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
459 }
460
461 /*
462  * This routine returns the disk from which the requested read should
463  * be done. It keeps track of the last read position for every disk
464  * in the array and, when a new read request comes in, chooses the
465  * disk whose last position is nearest to the request.
466  *
467  * TODO: now if there are 2 mirrors in the same 2 devices, performance
468  * degrades dramatically because position is mirror, not device based.
469  * This should be changed to be device based. Also atomic sequential
470  * reads should be somehow balanced.
471  */
472
473 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
474 {
475         int new_disk = conf->last_used;
476         const int sectors = bh->b_size >> 9;
477         const unsigned long this_sector = bh->b_rsector;
478         int disk = new_disk;
479         unsigned long new_distance;
480         unsigned long current_distance;
481         
482         /*
483          * Check if it is sane at all to balance
484          */
485         
486         if (conf->resync_mirrors)
487                 goto rb_out;
488         
489
490         /* make sure that disk is operational */
491         while( !conf->mirrors[new_disk].operational) {
492                 if (new_disk <= 0) new_disk = conf->raid_disks;
493                 new_disk--;
494                 if (new_disk == disk) {
495                         /*
496                          * This means no working disk was found
497                          * Nothing much to do, let's not change anything
498                          * and hope for the best...
499                          */
500                         
501                         new_disk = conf->last_used;
502
503                         goto rb_out;
504                 }
505         }
506         disk = new_disk;
507         /* now disk == new_disk == starting point for search */
508         
509         /*
510          * Don't touch anything for sequential reads.
511          */
512
513         if (this_sector == conf->mirrors[new_disk].head_position)
514                 goto rb_out;
515         
516         /*
517          * If reads have been done only on a single disk
518                  * for a time, let's give another disk a chance.
519          * This is for kicking those idling disks so that
520          * they would find work near some hotspot.
521          */
522         
523         if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
524                 conf->sect_count = 0;
525
526 #if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
527                 /* Work around a compiler bug in egcs-2.92.11 19980921 */
528                 new_disk = *(volatile int *)&new_disk;
529 #endif
530                 do {
531                         if (new_disk<=0)
532                                 new_disk = conf->raid_disks;
533                         new_disk--;
534                         if (new_disk == disk)
535                                 break;
536                 } while ((conf->mirrors[new_disk].write_only) ||
537                          (!conf->mirrors[new_disk].operational));
538
539                 goto rb_out;
540         }
541         
542         current_distance = abs(this_sector -
543                                 conf->mirrors[disk].head_position);
544         
545         /* Find the disk which is closest */
546         
547         do {
548                 if (disk <= 0)
549                         disk = conf->raid_disks;
550                 disk--;
551                 
552                 if ((conf->mirrors[disk].write_only) ||
553                                 (!conf->mirrors[disk].operational))
554                         continue;
555                 
556                 new_distance = abs(this_sector -
557                                         conf->mirrors[disk].head_position);
558                 
559                 if (new_distance < current_distance) {
560                         conf->sect_count = 0;
561                         current_distance = new_distance;
562                         new_disk = disk;
563                 }
564         } while (disk != conf->last_used);
565
566 rb_out:
567         conf->mirrors[new_disk].head_position = this_sector + sectors;
568
569         conf->last_used = new_disk;
570         conf->sect_count += sectors;
571
572         return new_disk;
573 }
574
575 static int raid1_make_request (request_queue_t *q, int rw,
576                                struct buffer_head * bh)
577 {
578         mddev_t *mddev = q->queuedata;
579         raid1_conf_t *conf = mddev_to_conf(mddev);
580         struct buffer_head *bh_req, *bhl;
581         struct raid1_bh * r1_bh;
582         int disks = MD_SB_DISKS;
583         int i, sum_bhs = 0;
584         struct mirror_info *mirror;
585
586         if (!buffer_locked(bh))
587                 BUG();
588         
589 /*
590  * make_request() can abort the operation when READA is being
591  * used and no empty request is available.
592  *
593  * Currently, just replace the command with READ/WRITE.
594  */
595         r1_bh = raid1_alloc_r1bh (conf);
596
597         spin_lock_irq(&conf->segment_lock);
598         wait_event_lock_irq(conf->wait_done,
599                         bh->b_rsector < conf->start_active ||
600                         bh->b_rsector >= conf->start_future,
601                         conf->segment_lock);
602         if (bh->b_rsector < conf->start_active) 
603                 conf->cnt_done++;
604         else {
605                 conf->cnt_future++;
606                 if (conf->phase)
607                         set_bit(R1BH_SyncPhase, &r1_bh->state);
608         }
609         spin_unlock_irq(&conf->segment_lock);
610         
611         /*
612          * i think the read and write branch should be separated completely,
613          * since we want to do read balancing on the read side for example.
614          * Alternative implementations? :) --mingo
615          */
616
617         r1_bh->master_bh = bh;
618         r1_bh->mddev = mddev;
619         r1_bh->cmd = rw;
620
621         if (rw == READ) {
622                 /*
623                  * read balancing logic:
624                  */
625                 mirror = conf->mirrors + raid1_read_balance(conf, bh);
626
627                 bh_req = &r1_bh->bh_req;
628                 memcpy(bh_req, bh, sizeof(*bh));
629                 bh_req->b_blocknr = bh->b_rsector;
630                 bh_req->b_dev = mirror->dev;
631                 bh_req->b_rdev = mirror->dev;
632         /*      bh_req->b_rsector = bh->n_rsector; */
633                 bh_req->b_end_io = raid1_end_request;
634                 bh_req->b_private = r1_bh;
635                 generic_make_request (rw, bh_req);
636                 return 0;
637         }
638
639         /*
640          * WRITE:
641          */
642
643         bhl = raid1_alloc_bh(conf, conf->raid_disks);
644         for (i = 0; i < disks; i++) {
645                 struct buffer_head *mbh;
646                 if (!conf->mirrors[i].operational) 
647                         continue;
648  
649         /*
650          * We should use a private pool (size depending on NR_REQUEST),
651          * to avoid writes filling up the memory with bhs
652          *
653          * Such pools are much faster than kmalloc anyway (so we waste
654          * almost nothing by not using the master bh when writing and
655          * win a lot of cleanness) but for now we are cool enough. --mingo
656          *
657          * It's safe to sleep here, buffer heads cannot be used in a shared
658          * manner in the write branch. Look how we lock the buffer at the
659          * beginning of this function to grok the difference ;)
660          */
661                 mbh = bhl;
662                 if (mbh == NULL) {
663                         MD_BUG();
664                         break;
665                 }
666                 bhl = mbh->b_next;
667                 mbh->b_next = NULL;
668                 mbh->b_this_page = (struct buffer_head *)1;
669                 
670         /*
671          * prepare mirrored mbh (fields ordered for max mem throughput):
672          */
673                 mbh->b_blocknr    = bh->b_rsector;
674                 mbh->b_dev        = conf->mirrors[i].dev;
675                 mbh->b_rdev       = conf->mirrors[i].dev;
676                 mbh->b_rsector    = bh->b_rsector;
677                 mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
678                                                 (1<<BH_Mapped) | (1<<BH_Lock);
679
680                 atomic_set(&mbh->b_count, 1);
681                 mbh->b_size       = bh->b_size;
682                 mbh->b_page       = bh->b_page;
683                 mbh->b_data       = bh->b_data;
684                 mbh->b_list       = BUF_LOCKED;
685                 mbh->b_end_io     = raid1_end_request;
686                 mbh->b_private    = r1_bh;
687
688                 mbh->b_next = r1_bh->mirror_bh_list;
689                 r1_bh->mirror_bh_list = mbh;
690                 sum_bhs++;
691         }
692         if (bhl) raid1_free_bh(conf,bhl);
693         if (!sum_bhs) {
694                 /* Gag - all mirrors non-operational.. */
695                 raid1_end_bh_io(r1_bh, 0);
696                 return 0;
697         }
698         md_atomic_set(&r1_bh->remaining, sum_bhs);
699
700         /*
701          * We have to be a bit careful about the semaphore above, that's
702          * why we start the requests separately. Since kmalloc() could
703          * fail or sleep, and make_request() can sleep too, this is the
704          * safer solution. Imagine, end_request decreasing the semaphore
705          * before we could have set it up ... We could play tricks with
706          * the semaphore (presetting it and correcting at the end if
707          * sum_bhs is not 'n' but we have to do end_request by hand if
708          * all requests finish until we had a chance to set up the
709          * semaphore correctly ... lots of races).
710          */
711         bh = r1_bh->mirror_bh_list;
712         while(bh) {
713                 struct buffer_head *bh2 = bh;
714                 bh = bh->b_next;
715                 generic_make_request(rw, bh2);
716         }
717         return (0);
718 }
719
720 static void raid1_status(struct seq_file *seq, mddev_t *mddev)
721 {
722         raid1_conf_t *conf = mddev_to_conf(mddev);
723         int i;
724         
725         seq_printf(seq, " [%d/%d] [", conf->raid_disks,
726                                                  conf->working_disks);
727         for (i = 0; i < conf->raid_disks; i++)
728                 seq_printf(seq, "%s",
729                         conf->mirrors[i].operational ? "U" : "_");
730         seq_printf(seq, "]");
731 }
732
733 #define LAST_DISK KERN_ALERT \
734 "raid1: only one disk left and IO error.\n"
735
736 #define NO_SPARE_DISK KERN_ALERT \
737 "raid1: no spare disk left, degrading mirror level by one.\n"
738
739 #define DISK_FAILED KERN_ALERT \
740 "raid1: Disk failure on %s, disabling device. \n" \
741 "       Operation continuing on %d devices\n"
742
743 #define START_SYNCING KERN_ALERT \
744 "raid1: start syncing spare disk.\n"
745
746 #define ALREADY_SYNCING KERN_INFO \
747 "raid1: syncing already in progress.\n"
748
749 static void mark_disk_bad (mddev_t *mddev, int failed)
750 {
751         raid1_conf_t *conf = mddev_to_conf(mddev);
752         struct mirror_info *mirror = conf->mirrors+failed;
753         mdp_super_t *sb = mddev->sb;
754
755         mirror->operational = 0;
756         mark_disk_faulty(sb->disks+mirror->number);
757         mark_disk_nonsync(sb->disks+mirror->number);
758         mark_disk_inactive(sb->disks+mirror->number);
759         if (!mirror->write_only)
760                 sb->active_disks--;
761         sb->working_disks--;
762         sb->failed_disks++;
763         mddev->sb_dirty = 1;
764         md_wakeup_thread(conf->thread);
765         if (!mirror->write_only)
766                 conf->working_disks--;
767         printk (DISK_FAILED, partition_name (mirror->dev),
768                                  conf->working_disks);
769 }
770
771 static int raid1_error (mddev_t *mddev, kdev_t dev)
772 {
773         raid1_conf_t *conf = mddev_to_conf(mddev);
774         struct mirror_info * mirrors = conf->mirrors;
775         int disks = MD_SB_DISKS;
776         int i;
777
778         /* Find the drive.
779          * If it is not operational, then we have already marked it as dead
780          * else if it is the last working disks, ignore the error, let the
781          * next level up know.
782          * else mark the drive as failed
783          */
784
785         for (i = 0; i < disks; i++)
786                 if (mirrors[i].dev==dev && mirrors[i].operational)
787                         break;
788         if (i == disks)
789                 return 0;
790
791         if (i < conf->raid_disks && conf->working_disks == 1) {
792                 /* Don't fail the drive, act as though we were just a
793                  * normal single drive
794                  */
795
796                 return 1;
797         }
798         mark_disk_bad(mddev, i);
799         return 0;
800 }
801
802 #undef LAST_DISK
803 #undef NO_SPARE_DISK
804 #undef DISK_FAILED
805 #undef START_SYNCING
806
807
808 static void print_raid1_conf (raid1_conf_t *conf)
809 {
810         int i;
811         struct mirror_info *tmp;
812
813         printk("RAID1 conf printout:\n");
814         if (!conf) {
815                 printk("(conf==NULL)\n");
816                 return;
817         }
818         printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
819                          conf->raid_disks, conf->nr_disks);
820
821         for (i = 0; i < MD_SB_DISKS; i++) {
822                 tmp = conf->mirrors + i;
823                 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
824                         i, tmp->spare,tmp->operational,
825                         tmp->number,tmp->raid_disk,tmp->used_slot,
826                         partition_name(tmp->dev));
827         }
828 }
829
830 static void close_sync(raid1_conf_t *conf)
831 {
832         mddev_t *mddev = conf->mddev;
833         /* If reconstruction was interrupted, we need to close the "active" and "pending"
834          * holes.
835          * We know that there are no active rebuild requests, so cnt_active == cnt_ready == 0
836          */
837         /* this is really needed when recovery stops too... */
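        /* The drain happens in two passes: first wait out everything
         * counted in cnt_pending (the PENDING segment plus FUTURE requests
         * tagged with the previous phase), then flip the phase so that the
         * remaining FUTURE requests become pending and wait for those as
         * well, before collapsing all the segment boundaries back to zero.
         */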
838         spin_lock_irq(&conf->segment_lock);
839         conf->start_active = conf->start_pending;
840         conf->start_ready = conf->start_pending;
841         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
842         conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
843         conf->start_future = (mddev->sb->size<<1)+1;
844         conf->cnt_pending = conf->cnt_future;
845         conf->cnt_future = 0;
846         conf->phase = conf->phase ^1;
847         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
848         conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
849         conf->phase = 0;
850         conf->cnt_future = conf->cnt_done;
851         conf->cnt_done = 0;
852         spin_unlock_irq(&conf->segment_lock);
853         wake_up(&conf->wait_done);
854 }
855
856 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
857 {
858         int err = 0;
859         int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
860         raid1_conf_t *conf = mddev->private;
861         struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
862         mdp_super_t *sb = mddev->sb;
863         mdp_disk_t *failed_desc, *spare_desc, *added_desc;
864         mdk_rdev_t *spare_rdev, *failed_rdev;
865
866         print_raid1_conf(conf);
867
868         switch (state) {
869         case DISKOP_SPARE_ACTIVE:
870         case DISKOP_SPARE_INACTIVE:
871                 /* need to wait for pending sync io before locking device */
872                 close_sync(conf);
873         }
874
875         md_spin_lock_irq(&conf->device_lock);
876         /*
877          * find the disk ...
878          */
879         switch (state) {
880
881         case DISKOP_SPARE_ACTIVE:
882
883                 /*
884                  * Find the failed disk within the RAID1 configuration ...
885                  * (this can only be in the first conf->working_disks part)
886                  */
887                 for (i = 0; i < conf->raid_disks; i++) {
888                         tmp = conf->mirrors + i;
889                         if ((!tmp->operational && !tmp->spare) ||
890                                         !tmp->used_slot) {
891                                 failed_disk = i;
892                                 break;
893                         }
894                 }
895                 /*
896                  * When we activate a spare disk we _must_ have a disk in
897                  * the lower (active) part of the array to replace. 
898                  */
899                 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
900                         MD_BUG();
901                         err = 1;
902                         goto abort;
903                 }
904                 /* fall through */
905
906         case DISKOP_SPARE_WRITE:
907         case DISKOP_SPARE_INACTIVE:
908
909                 /*
910                  * Find the spare disk ... (can only be in the 'high'
911                  * area of the array)
912                  */
913                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
914                         tmp = conf->mirrors + i;
915                         if (tmp->spare && tmp->number == (*d)->number) {
916                                 spare_disk = i;
917                                 break;
918                         }
919                 }
920                 if (spare_disk == -1) {
921                         MD_BUG();
922                         err = 1;
923                         goto abort;
924                 }
925                 break;
926
927         case DISKOP_HOT_REMOVE_DISK:
928
929                 for (i = 0; i < MD_SB_DISKS; i++) {
930                         tmp = conf->mirrors + i;
931                         if (tmp->used_slot && (tmp->number == (*d)->number)) {
932                                 if (tmp->operational) {
933                                         err = -EBUSY;
934                                         goto abort;
935                                 }
936                                 removed_disk = i;
937                                 break;
938                         }
939                 }
940                 if (removed_disk == -1) {
941                         MD_BUG();
942                         err = 1;
943                         goto abort;
944                 }
945                 break;
946
947         case DISKOP_HOT_ADD_DISK:
948
949                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
950                         tmp = conf->mirrors + i;
951                         if (!tmp->used_slot) {
952                                 added_disk = i;
953                                 break;
954                         }
955                 }
956                 if (added_disk == -1) {
957                         MD_BUG();
958                         err = 1;
959                         goto abort;
960                 }
961                 break;
962         }
963
964         switch (state) {
965         /*
966          * Switch the spare disk to write-only mode:
967          */
968         case DISKOP_SPARE_WRITE:
969                 sdisk = conf->mirrors + spare_disk;
970                 sdisk->operational = 1;
971                 sdisk->write_only = 1;
972                 break;
973         /*
974          * Deactivate a spare disk:
975          */
976         case DISKOP_SPARE_INACTIVE:
977                 if (conf->start_future > 0) {
978                         MD_BUG();
979                         err = -EBUSY;
980                         break;
981                 }
982                 sdisk = conf->mirrors + spare_disk;
983                 sdisk->operational = 0;
984                 sdisk->write_only = 0;
985                 break;
986         /*
987          * Activate (mark read-write) the (now sync) spare disk,
988          * which means we switch its 'raid position' (->raid_disk)
989          * with the failed disk. (only the first 'conf->nr_disks'
990          * slots are used for 'real' disks and we must preserve this
991          * property)
992          */
993         case DISKOP_SPARE_ACTIVE:
994                 if (conf->start_future > 0) {
995                         MD_BUG();
996                         err = -EBUSY;
997                         break;
998                 }
999                 sdisk = conf->mirrors + spare_disk;
1000                 fdisk = conf->mirrors + failed_disk;
1001
1002                 spare_desc = &sb->disks[sdisk->number];
1003                 failed_desc = &sb->disks[fdisk->number];
1004
1005                 if (spare_desc != *d) {
1006                         MD_BUG();
1007                         err = 1;
1008                         goto abort;
1009                 }
1010
1011                 if (spare_desc->raid_disk != sdisk->raid_disk) {
1012                         MD_BUG();
1013                         err = 1;
1014                         goto abort;
1015                 }
1016                         
1017                 if (sdisk->raid_disk != spare_disk) {
1018                         MD_BUG();
1019                         err = 1;
1020                         goto abort;
1021                 }
1022
1023                 if (failed_desc->raid_disk != fdisk->raid_disk) {
1024                         MD_BUG();
1025                         err = 1;
1026                         goto abort;
1027                 }
1028
1029                 if (fdisk->raid_disk != failed_disk) {
1030                         MD_BUG();
1031                         err = 1;
1032                         goto abort;
1033                 }
1034
1035                 /*
1036                  * do the switch finally
1037                  */
1038                 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1039                 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1040
1041                 /* There must be a spare_rdev, but there may not be a
1042                  * failed_rdev.  That slot might be empty...
1043                  */
1044                 spare_rdev->desc_nr = failed_desc->number;
1045                 if (failed_rdev)
1046                         failed_rdev->desc_nr = spare_desc->number;
1047                 
1048                 xchg_values(*spare_desc, *failed_desc);
1049                 xchg_values(*fdisk, *sdisk);
1050
1051                 /*
1052                  * (careful, 'failed' and 'spare' are switched from now on)
1053                  *
1054                  * we want to preserve linear numbering and we want to
1055                  * give the proper raid_disk number to the now activated
1056                  * disk. (this means we switch back these values)
1057                  */
1058         
1059                 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1060                 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1061                 xchg_values(spare_desc->number, failed_desc->number);
1062                 xchg_values(sdisk->number, fdisk->number);
1063
1064                 *d = failed_desc;
1065
1066                 if (sdisk->dev == MKDEV(0,0))
1067                         sdisk->used_slot = 0;
1068                 /*
1069                  * this really activates the spare.
1070                  */
1071                 fdisk->spare = 0;
1072                 fdisk->write_only = 0;
1073
1074                 /*
1075                  * if we activate a spare, we definitely replace a
1076                  * non-operational disk slot in the 'low' area of
1077                  * the disk array.
1078                  */
1079
1080                 conf->working_disks++;
1081
1082                 break;
1083
1084         case DISKOP_HOT_REMOVE_DISK:
1085                 rdisk = conf->mirrors + removed_disk;
1086
1087                 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1088                         MD_BUG();       
1089                         err = 1;
1090                         goto abort;
1091                 }
1092                 rdisk->dev = MKDEV(0,0);
1093                 rdisk->used_slot = 0;
1094                 conf->nr_disks--;
1095                 break;
1096
1097         case DISKOP_HOT_ADD_DISK:
1098                 adisk = conf->mirrors + added_disk;
1099                 added_desc = *d;
1100
1101                 if (added_disk != added_desc->number) {
1102                         MD_BUG();       
1103                         err = 1;
1104                         goto abort;
1105                 }
1106
1107                 adisk->number = added_desc->number;
1108                 adisk->raid_disk = added_desc->raid_disk;
1109                 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1110
1111                 adisk->operational = 0;
1112                 adisk->write_only = 0;
1113                 adisk->spare = 1;
1114                 adisk->used_slot = 1;
1115                 adisk->head_position = 0;
1116                 conf->nr_disks++;
1117
1118                 break;
1119
1120         default:
1121                 MD_BUG();       
1122                 err = 1;
1123                 goto abort;
1124         }
1125 abort:
1126         md_spin_unlock_irq(&conf->device_lock);
1127         if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1128                 /* should move to "END_REBUILD" when such exists */
1129                 raid1_shrink_buffers(conf);
1130
1131         print_raid1_conf(conf);
1132         return err;
1133 }
1134
1135
1136 #define IO_ERROR KERN_ALERT \
1137 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1138
1139 #define REDIRECT_SECTOR KERN_ERR \
1140 "raid1: %s: redirecting sector %lu to another mirror\n"
1141
1142 /*
1143  * This is a kernel thread which:
1144  *
1145  *      1.      Retries failed read operations on working mirrors.
1146  *      2.      Updates the raid superblock when problems are encountered.
1147  *      3.      Performs writes following reads for array synchronising.
1148  */
1149 static void end_sync_write(struct buffer_head *bh, int uptodate);
1150 static void end_sync_read(struct buffer_head *bh, int uptodate);
1151
1152 static void raid1d (void *data)
1153 {
1154         struct raid1_bh *r1_bh;
1155         struct buffer_head *bh;
1156         unsigned long flags;
1157         raid1_conf_t *conf = data;
1158         mddev_t *mddev = conf->mddev;
1159         kdev_t dev;
1160
1161         if (mddev->sb_dirty)
1162                 md_update_sb(mddev);
1163
1164         for (;;) {
1165                 md_spin_lock_irqsave(&retry_list_lock, flags);
1166                 r1_bh = raid1_retry_list;
1167                 if (!r1_bh)
1168                         break;
1169                 raid1_retry_list = r1_bh->next_r1;
1170                 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1171
1172                 mddev = r1_bh->mddev;
1173                 bh = &r1_bh->bh_req;
1174                 switch(r1_bh->cmd) {
1175                 case SPECIAL:
1176                         /* have to allocate lots of bh structures and
1177                          * schedule writes
1178                          */
1179                         if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1180                                 int i, sum_bhs = 0;
1181                                 int disks = MD_SB_DISKS;
1182                                 struct buffer_head *bhl, *mbh;
1183                                 
1184                                 conf = mddev_to_conf(mddev);
1185                                 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1186                                 for (i = 0; i < disks ; i++) {
1187                                         if (!conf->mirrors[i].operational)
1188                                                 continue;
1189                                         if (i==conf->last_used)
1190                                                 /* we read from here, no need to write */
1191                                                 continue;
1192                                         if (i < conf->raid_disks
1193                                             && !conf->resync_mirrors)
1194                                                 /* don't need to write this,
1195                                                  * we are just rebuilding */
1196                                                 continue;
1197                                         mbh = bhl;
1198                                         if (!mbh) {
1199                                                 MD_BUG();
1200                                                 break;
1201                                         }
1202                                         bhl = mbh->b_next;
1203                                         mbh->b_this_page = (struct buffer_head *)1;
1204
1205                                                 
1206                                 /*
1207                                  * prepare mirrored bh (fields ordered for max mem throughput):
1208                                  */
1209                                         mbh->b_blocknr    = bh->b_blocknr;
1210                                         mbh->b_dev        = conf->mirrors[i].dev;
1211                                         mbh->b_rdev       = conf->mirrors[i].dev;
1212                                         mbh->b_rsector    = bh->b_blocknr;
1213                                         mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1214                                                 (1<<BH_Mapped) | (1<<BH_Lock);
1215                                         atomic_set(&mbh->b_count, 1);
1216                                         mbh->b_size       = bh->b_size;
1217                                         mbh->b_page       = bh->b_page;
1218                                         mbh->b_data       = bh->b_data;
1219                                         mbh->b_list       = BUF_LOCKED;
1220                                         mbh->b_end_io     = end_sync_write;
1221                                         mbh->b_private    = r1_bh;
1222
1223                                         mbh->b_next = r1_bh->mirror_bh_list;
1224                                         r1_bh->mirror_bh_list = mbh;
1225
1226                                         sum_bhs++;
1227                                 }
1228                                 md_atomic_set(&r1_bh->remaining, sum_bhs);
1229                                 if (bhl) raid1_free_bh(conf, bhl);
1230                                 mbh = r1_bh->mirror_bh_list;
1231
1232                                 if (!sum_bhs) {
1233                                         /* nowhere to write this to... I guess we
1234                                          * must be done
1235                                          */
1236                                         sync_request_done(bh->b_blocknr, conf);
1237                                         md_done_sync(mddev, bh->b_size>>9, 0);
1238                                         raid1_free_buf(r1_bh);
1239                                 } else
1240                                 while (mbh) {
1241                                         struct buffer_head *bh1 = mbh;
1242                                         mbh = mbh->b_next;
1243                                         generic_make_request(WRITE, bh1);
1244                                         md_sync_acct(bh1->b_dev, bh1->b_size/512);
1245                                 }
1246                         } else {
1247                                 /* There is no point trying a read-for-reconstruct
1248                                  * as reconstruct is about to be aborted
1249                                  */
1250
1251                                 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1252                                 md_done_sync(mddev, bh->b_size>>9, 0);
1253                         }
1254
1255                         break;
1256                 case READ:
1257                 case READA:
1258                         dev = bh->b_dev;
1259                         raid1_map (mddev, &bh->b_dev);
1260                         if (bh->b_dev == dev) {
1261                                 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1262                                 raid1_end_bh_io(r1_bh, 0);
1263                         } else {
1264                                 printk (REDIRECT_SECTOR,
1265                                         partition_name(bh->b_dev), bh->b_blocknr);
1266                                 bh->b_rdev = bh->b_dev;
1267                                 bh->b_rsector = bh->b_blocknr;
1268                                 generic_make_request (r1_bh->cmd, bh);
1269                         }
1270                         break;
1271                 }
1272         }
1273         md_spin_unlock_irqrestore(&retry_list_lock, flags);
1274 }
1275 #undef IO_ERROR
1276 #undef REDIRECT_SECTOR
1277
1278 /*
1279  * Private kernel thread to reconstruct mirrors after an unclean
1280  * shutdown.
1281  */
1282 static void raid1syncd (void *data)
1283 {
1284         raid1_conf_t *conf = data;
1285         mddev_t *mddev = conf->mddev;
1286
1287         if (!conf->resync_mirrors)
1288                 return;
1289         if (mddev->recovery_running != 2)
1290                 return;
1291         if (!md_do_sync(mddev, NULL)) {
1292                 /*
1293                  * Only if everything went Ok.
1294                  */
1295                 conf->resync_mirrors = 0;
1296         }
1297
1298         close_sync(conf);
1299
1300 }
1301
1302 /*
1303  * perform a "sync" on one "block"
1304  *
1305  * We need to make sure that no normal I/O request - particularly write
1306  * requests - conflict with active sync requests.
1307  * This is achieved by conceptually dividing the device space into a
1308  * number of sections:
1309  *  DONE: 0 .. a-1     These blocks are in-sync
1310  *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1311  *                     no normal IO requests
1312  *  READY: b .. c-1    These blocks have no normal IO requests - sync
1313  *                     request may be happening
1314  *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1315  *                     ones will be added
1316  *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1317  *                     be happening, but not sync
1318  *
1319  * We keep a
1320  *   phase    which flips (0 or 1) each time d moves and
1321  * a count of:
1322  *   z =  active io requests in FUTURE since d moved - marked with
1323  *        current phase
1324  *   y =  active io requests in FUTURE before d moved, or PENDING -
1325  *        marked with previous phase
1326  *   x =  active sync requests in READY
1327  *   w =  active sync requests in ACTIVE
1328  *   v =  active io requests in DONE
1329  *
1330  * Normally, a=b=c=d=0 and z= active io requests
1331  *   or a=b=c=d=END and v= active io requests
1332  * Allowed changes to a,b,c,d:
1333  * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1334  * B:  y==0 -> c=d
1335  * C:   b=c, w+=x, x=0
1336  * D:  w==0 -> a=b
1337  * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1338  *
1339  * At start of sync we apply A.
1340  * When y reaches 0, we apply B then A then begin sync requests.
1341  * When the sync point reaches c-1, we wait for y==0 and w==0, and
1342  * then apply B then A then D then C.
1343  * Finally, we apply E
1344  *
1345  * The sync request simply issues a "read" against a working drive
1346  * This is marked so that on completion the raid1d thread is woken to
1347  * issue suitable write requests
1348  */
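
/*
 * Illustrative sketch (not part of the original driver, and compiled
 * out): how a sector number maps onto the segments described above,
 * using the boundary fields kept in raid1_conf_t.
 */
#if 0
static int raid1_sector_segment(raid1_conf_t *conf, unsigned long sector)
{
        if (sector < conf->start_active)
                return 0;       /* DONE: in-sync, normal IO only */
        if (sector < conf->start_ready)
                return 1;       /* ACTIVE: sync requests may be in flight */
        if (sector < conf->start_pending)
                return 2;       /* READY: no normal IO, sync may start here */
        if (sector < conf->start_future)
                return 3;       /* PENDING: old normal IO may still complete */
        return 4;               /* FUTURE: normal IO allowed, no sync yet */
}
#endif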
1349
1350 static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1351 {
1352         raid1_conf_t *conf = mddev_to_conf(mddev);
1353         struct mirror_info *mirror;
1354         struct raid1_bh *r1_bh;
1355         struct buffer_head *bh;
1356         int bsize;
1357         int disk;
1358         int block_nr;
1359         int buffs;
1360
1361         if (!sector_nr) {
1362                 /* we want enough buffers to hold twice the window of 128*/
1363                 buffs = 128 *2 / (PAGE_SIZE>>9);
1364                 buffs = raid1_grow_buffers(conf, buffs);
1365                 if (buffs < 2)
1366                         goto nomem;
1367                 conf->window = buffs*(PAGE_SIZE>>9)/2;
1368         }
1369         spin_lock_irq(&conf->segment_lock);
1370         if (!sector_nr) {
1371                 /* initialize ...*/
1372                 conf->start_active = 0;
1373                 conf->start_ready = 0;
1374                 conf->start_pending = 0;
1375                 conf->start_future = 0;
1376                 conf->phase = 0;
1377                 
1378                 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1379                 conf->cnt_done = conf->cnt_pending = 0;
1380                 if (conf->cnt_ready || conf->cnt_active)
1381                         MD_BUG();
1382         }
1383         while (sector_nr >= conf->start_pending) {
1384                 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1385                         sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1386                         conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1387                 wait_event_lock_irq(conf->wait_done,
1388                                         !conf->cnt_active,
1389                                         conf->segment_lock);
1390                 wait_event_lock_irq(conf->wait_ready,
1391                                         !conf->cnt_pending,
1392                                         conf->segment_lock);
1393                 conf->start_active = conf->start_ready;
1394                 conf->start_ready = conf->start_pending;
1395                 conf->start_pending = conf->start_future;
1396                 conf->start_future = conf->start_future+conf->window;
1397                 // Note: falling off the end is not a problem
1398                 conf->phase = conf->phase ^1;
1399                 conf->cnt_active = conf->cnt_ready;
1400                 conf->cnt_ready = 0;
1401                 conf->cnt_pending = conf->cnt_future;
1402                 conf->cnt_future = 0;
1403                 wake_up(&conf->wait_done);
1404         }
1405         conf->cnt_ready++;
1406         spin_unlock_irq(&conf->segment_lock);
1407                 
1408
1409         /* If reconstructing, and >1 working disc,
1410          * could dedicate one to rebuild and others to
1411          * service read requests ..
1412          */
1413         disk = conf->last_used;
1414         /* make sure disk is operational */
1415         while (!conf->mirrors[disk].operational) {
1416                 if (disk <= 0) disk = conf->raid_disks;
1417                 disk--;
1418                 if (disk == conf->last_used)
1419                         break;
1420         }
1421         conf->last_used = disk;
1422         
1423         mirror = conf->mirrors+conf->last_used;
1424         
1425         r1_bh = raid1_alloc_buf (conf);
1426         r1_bh->master_bh = NULL;
1427         r1_bh->mddev = mddev;
1428         r1_bh->cmd = SPECIAL;
1429         bh = &r1_bh->bh_req;
1430
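             /*
              * Pick the largest power-of-two block size, up to one page,
              * that keeps sector_nr aligned and does not run past the end
              * of the device (sb->size is in 1K units, hence the *2 to get
              * sectors).  block_nr becomes the block number in bsize units.
              */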
1431         block_nr = sector_nr;
1432         bsize = 512;
1433         while (!(block_nr & 1) && bsize < PAGE_SIZE
1434                         && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
1435                 block_nr >>= 1;
1436                 bsize <<= 1;
1437         }
1438         bh->b_size = bsize;
1439         bh->b_list = BUF_LOCKED;
1440         bh->b_dev = mirror->dev;
1441         bh->b_rdev = mirror->dev;
1442         bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1443         if (!bh->b_page)
1444                 BUG();
1445         if (!bh->b_data)
1446                 BUG();
1447         if (bh->b_data != page_address(bh->b_page))
1448                 BUG();
1449         bh->b_end_io = end_sync_read;
1450         bh->b_private = r1_bh;
1451         bh->b_blocknr = sector_nr;
1452         bh->b_rsector = sector_nr;
1453         init_waitqueue_head(&bh->b_wait);
1454
1455         generic_make_request(READ, bh);
1456         md_sync_acct(bh->b_dev, bh->b_size/512);
1457
1458         return (bsize >> 9);
1459
1460 nomem:
1461         raid1_shrink_buffers(conf);
1462         return -ENOMEM;
1463 }
1464
1465 static void end_sync_read(struct buffer_head *bh, int uptodate)
1466 {
1467         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1468
1469         /* we have read a block, now it needs to be re-written,
1470          * or re-read if the read failed.
1471          * We don't do much here, just schedule handling by raid1d
1472          */
1473         if (!uptodate)
1474                 md_error (r1_bh->mddev, bh->b_dev);
1475         else
1476                 set_bit(R1BH_Uptodate, &r1_bh->state);
1477         raid1_reschedule_retry(r1_bh);
1478 }
1479
1480 static void end_sync_write(struct buffer_head *bh, int uptodate)
1481 {
1482         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1483         
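             /*
              * A resync write to one of the mirrors has completed.  When
              * the last outstanding write for this block finishes, free the
              * r1_bh, retire the sectors from the resync window and report
              * the number of synced sectors back to md.
              */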
1484         if (!uptodate)
1485                 md_error (r1_bh->mddev, bh->b_dev);
1486         if (atomic_dec_and_test(&r1_bh->remaining)) {
1487                 mddev_t *mddev = r1_bh->mddev;
1488                 unsigned long sect = bh->b_blocknr;
1489                 int size = bh->b_size;
1490                 raid1_free_buf(r1_bh);
1491                 sync_request_done(sect, mddev_to_conf(mddev));
1492                 md_done_sync(mddev,size>>9, uptodate);
1493         }
1494 }
1495
1496 #define INVALID_LEVEL KERN_WARNING \
1497 "raid1: md%d: raid level not set to mirroring (%d)\n"
1498
1499 #define NO_SB KERN_ERR \
1500 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1501
1502 #define ERRORS KERN_ERR \
1503 "raid1: disabled mirror %s (errors detected)\n"
1504
1505 #define NOT_IN_SYNC KERN_ERR \
1506 "raid1: disabled mirror %s (not in sync)\n"
1507
1508 #define INCONSISTENT KERN_ERR \
1509 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1510
1511 #define ALREADY_RUNNING KERN_ERR \
1512 "raid1: disabled mirror %s (mirror %d already operational)\n"
1513
1514 #define OPERATIONAL KERN_INFO \
1515 "raid1: device %s operational as mirror %d\n"
1516
1517 #define MEM_ERROR KERN_ERR \
1518 "raid1: couldn't allocate memory for md%d\n"
1519
1520 #define SPARE KERN_INFO \
1521 "raid1: spare disk %s\n"
1522
1523 #define NONE_OPERATIONAL KERN_ERR \
1524 "raid1: no operational mirrors for md%d\n"
1525
1526 #define ARRAY_IS_ACTIVE KERN_INFO \
1527 "raid1: raid set md%d active with %d out of %d mirrors\n"
1528
1529 #define THREAD_ERROR KERN_ERR \
1530 "raid1: couldn't allocate thread for md%d\n"
1531
1532 #define START_RESYNC KERN_WARNING \
1533 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1534
1535 static int raid1_run (mddev_t *mddev)
1536 {
1537         raid1_conf_t *conf;
1538         int i, j, disk_idx;
1539         struct mirror_info *disk;
1540         mdp_super_t *sb = mddev->sb;
1541         mdp_disk_t *descriptor;
1542         mdk_rdev_t *rdev;
1543         struct md_list_head *tmp;
1544         int start_recovery = 0;
1545
1546         MOD_INC_USE_COUNT;
1547
1548         if (sb->level != 1) {
1549                 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1550                 goto out;
1551         }
1552         /*
1553          * copy the already verified devices into our private RAID1
1554          * bookkeeping area. [whatever we allocate in raid1_run(),
1555          * should be freed in raid1_stop()]
1556          */
1557
1558         conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1559         mddev->private = conf;
1560         if (!conf) {
1561                 printk(MEM_ERROR, mdidx(mddev));
1562                 goto out;
1563         }
1564         memset(conf, 0, sizeof(*conf));
1565
1566         ITERATE_RDEV(mddev,rdev,tmp) {
1567                 if (rdev->faulty) {
1568                         printk(ERRORS, partition_name(rdev->dev));
1569                 } else {
1570                         if (!rdev->sb) {
1571                                 MD_BUG();
1572                                 continue;
1573                         }
1574                 }
1575                 if (rdev->desc_nr == -1) {
1576                         MD_BUG();
1577                         continue;
1578                 }
1579                 descriptor = &sb->disks[rdev->desc_nr];
1580                 disk_idx = descriptor->raid_disk;
1581                 disk = conf->mirrors + disk_idx;
1582
1583                 if (disk_faulty(descriptor)) {
1584                         disk->number = descriptor->number;
1585                         disk->raid_disk = disk_idx;
1586                         disk->dev = rdev->dev;
1587                         disk->sect_limit = MAX_WORK_PER_DISK;
1588                         disk->operational = 0;
1589                         disk->write_only = 0;
1590                         disk->spare = 0;
1591                         disk->used_slot = 1;
1592                         disk->head_position = 0;
1593                         continue;
1594                 }
1595                 if (disk_active(descriptor)) {
1596                         if (!disk_sync(descriptor)) {
1597                                 printk(NOT_IN_SYNC,
1598                                         partition_name(rdev->dev));
1599                                 continue;
1600                         }
1601                         if ((descriptor->number >= MD_SB_DISKS) ||
1602                                          (disk_idx >= sb->raid_disks)) {
1603
1604                                 printk(INCONSISTENT,
1605                                         partition_name(rdev->dev));
1606                                 continue;
1607                         }
1608                         if (disk->operational) {
1609                                 printk(ALREADY_RUNNING,
1610                                         partition_name(rdev->dev),
1611                                         disk_idx);
1612                                 continue;
1613                         }
1614                         printk(OPERATIONAL, partition_name(rdev->dev),
1615                                         disk_idx);
1616                         disk->number = descriptor->number;
1617                         disk->raid_disk = disk_idx;
1618                         disk->dev = rdev->dev;
1619                         disk->sect_limit = MAX_WORK_PER_DISK;
1620                         disk->operational = 1;
1621                         disk->write_only = 0;
1622                         disk->spare = 0;
1623                         disk->used_slot = 1;
1624                         disk->head_position = 0;
1625                         conf->working_disks++;
1626                 } else {
1627                         /*
1628                          * Must be a spare disk ..
1629                          */
1630                         printk(SPARE, partition_name(rdev->dev));
1631                         disk->number = descriptor->number;
1632                         disk->raid_disk = disk_idx;
1633                         disk->dev = rdev->dev;
1634                         disk->sect_limit = MAX_WORK_PER_DISK;
1635                         disk->operational = 0;
1636                         disk->write_only = 0;
1637                         disk->spare = 1;
1638                         disk->used_slot = 1;
1639                         disk->head_position = 0;
1640                 }
1641         }
1642         conf->raid_disks = sb->raid_disks;
1643         conf->nr_disks = sb->nr_disks;
1644         conf->mddev = mddev;
1645         conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1646
1647         conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1648         init_waitqueue_head(&conf->wait_buffer);
1649         init_waitqueue_head(&conf->wait_done);
1650         init_waitqueue_head(&conf->wait_ready);
1651
1652         if (!conf->working_disks) {
1653                 printk(NONE_OPERATIONAL, mdidx(mddev));
1654                 goto out_free_conf;
1655         }
1656
1657
1658         /* pre-allocate some buffer_head structures.
1659          * As a minimum, 1 r1bh and raid_disks buffer_heads
1660          * would probably get us by in tight memory situations,
1661          * but a few more is probably a good idea.
1662          * For now, try NR_RESERVED_BUFS r1bh and
1663          * NR_RESERVED_BUFS*raid_disks bufferheads
1664          * This will allow at least NR_RESERVED_BUFS concurrent
1665          * reads or writes even if kmalloc starts failing
1666          */
1667         if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1668             raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1669                               < NR_RESERVED_BUFS*conf->raid_disks) {
1670                 printk(MEM_ERROR, mdidx(mddev));
1671                 goto out_free_conf;
1672         }
1673
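             /*
              * Superblock descriptors may record failed disks that have no
              * rdev attached; mark their slots as used (but not operational)
              * so the array geometry stays consistent.
              */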
1674         for (i = 0; i < MD_SB_DISKS; i++) {
1675                 
1676                 descriptor = sb->disks+i;
1677                 disk_idx = descriptor->raid_disk;
1678                 disk = conf->mirrors + disk_idx;
1679
1680                 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1681                                 !disk->used_slot) {
1682
1683                         disk->number = descriptor->number;
1684                         disk->raid_disk = disk_idx;
1685                         disk->dev = MKDEV(0,0);
1686
1687                         disk->operational = 0;
1688                         disk->write_only = 0;
1689                         disk->spare = 0;
1690                         disk->used_slot = 1;
1691                         disk->head_position = 0;
1692                 }
1693         }
1694
1695         /*
1696          * find the first working disk and use it as a starting point
1697          * for read balancing.
1698          */
1699         for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
1700                 /* nothing */;
1701         conf->last_used = j;
1702
1703
1704
1705         {
1706                 const char * name = "raid1d";
1707
1708                 conf->thread = md_register_thread(raid1d, conf, name);
1709                 if (!conf->thread) {
1710                         printk(THREAD_ERROR, mdidx(mddev));
1711                         goto out_free_conf;
1712                 }
1713         }
1714
1715         if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
1716             (conf->working_disks > 1)) {
1717                 const char * name = "raid1syncd";
1718
1719                 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1720                 if (!conf->resync_thread) {
1721                         printk(THREAD_ERROR, mdidx(mddev));
1722                         goto out_free_conf;
1723                 }
1724
1725                 printk(START_RESYNC, mdidx(mddev));
1726                 conf->resync_mirrors = 1;
1727                 mddev->recovery_running = 2;
1728                 md_wakeup_thread(conf->resync_thread);
1729         }
1730
1731         /*
1732          * Regenerate the "device is in sync with the raid set" bit for
1733          * each device.
1734          */
1735         for (i = 0; i < MD_SB_DISKS; i++) {
1736                 mark_disk_nonsync(sb->disks+i);
1737                 for (j = 0; j < sb->raid_disks; j++) {
1738                         if (!conf->mirrors[j].operational)
1739                                 continue;
1740                         if (sb->disks[i].number == conf->mirrors[j].number)
1741                                 mark_disk_sync(sb->disks+i);
1742                 }
1743         }
1744         sb->active_disks = conf->working_disks;
1745
1746         printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1747         /*
1748          * Ok, everything is just fine now
1749          */
1750         return 0;
1751
1752 out_free_conf:
1753         raid1_shrink_r1bh(conf);
1754         raid1_shrink_bh(conf);
1755         raid1_shrink_buffers(conf);
1756         kfree(conf);
1757         mddev->private = NULL;
1758 out:
1759         MOD_DEC_USE_COUNT;
1760         return -EIO;
1761 }
1762
1763 #undef INVALID_LEVEL
1764 #undef NO_SB
1765 #undef ERRORS
1766 #undef NOT_IN_SYNC
1767 #undef INCONSISTENT
1768 #undef ALREADY_RUNNING
1769 #undef OPERATIONAL
1770 #undef SPARE
1771 #undef NONE_OPERATIONAL
1772 #undef ARRAY_IS_ACTIVE
1773
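     /*
      * Called by the md layer to interrupt a mirror resync that is still in
      * progress; returning 1 signals that the resync was cut short and will
      * need to be restarted later via raid1_restart_resync().
      */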
1774 static int raid1_stop_resync (mddev_t *mddev)
1775 {
1776         raid1_conf_t *conf = mddev_to_conf(mddev);
1777
1778         if (conf->resync_thread) {
1779                 if (conf->resync_mirrors) {
1780                         md_interrupt_thread(conf->resync_thread);
1781
1782                         printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1783                         return 1;
1784                 }
1785                 return 0;
1786         }
1787         return 0;
1788 }
1789
1790 static int raid1_restart_resync (mddev_t *mddev)
1791 {
1792         raid1_conf_t *conf = mddev_to_conf(mddev);
1793
1794         if (conf->resync_mirrors) {
1795                 if (!conf->resync_thread) {
1796                         MD_BUG();
1797                         return 0;
1798                 }
1799                 mddev->recovery_running = 2;
1800                 md_wakeup_thread(conf->resync_thread);
1801                 return 1;
1802         }
1803         return 0;
1804 }
1805
1806 static int raid1_stop (mddev_t *mddev)
1807 {
1808         raid1_conf_t *conf = mddev_to_conf(mddev);
1809
1810         md_unregister_thread(conf->thread);
1811         if (conf->resync_thread)
1812                 md_unregister_thread(conf->resync_thread);
1813         raid1_shrink_r1bh(conf);
1814         raid1_shrink_bh(conf);
1815         raid1_shrink_buffers(conf);
1816         kfree(conf);
1817         mddev->private = NULL;
1818         MOD_DEC_USE_COUNT;
1819         return 0;
1820 }
1821
1822 static mdk_personality_t raid1_personality=
1823 {
1824         name:           "raid1",
1825         make_request:   raid1_make_request,
1826         run:            raid1_run,
1827         stop:           raid1_stop,
1828         status:         raid1_status,
1829         error_handler:  raid1_error,
1830         diskop:         raid1_diskop,
1831         stop_resync:    raid1_stop_resync,
1832         restart_resync: raid1_restart_resync,
1833         sync_request:   raid1_sync_request
1834 };
1835
1836 static int md__init raid1_init (void)
1837 {
1838         return register_md_personality (RAID1, &raid1_personality);
1839 }
1840
1841 static void raid1_exit (void)
1842 {
1843         unregister_md_personality (RAID1);
1844 }
1845
1846 module_init(raid1_init);
1847 module_exit(raid1_exit);
1848 MODULE_LICENSE("GPL");