linux-old/drivers/md/raid1.c
<<
>>
Prefs
   1/*
   2 * raid1.c : Multiple Devices driver for Linux
   3 *
   4 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
   5 *
   6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   7 *
   8 * RAID-1 management functions.
   9 *
  10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
  11 *
  12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
  13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
  14 *
  15 * This program is free software; you can redistribute it and/or modify
  16 * it under the terms of the GNU General Public License as published by
  17 * the Free Software Foundation; either version 2, or (at your option)
  18 * any later version.
  19 *
  20 * You should have received a copy of the GNU General Public License
  21 * (for example /usr/src/linux/COPYING); if not, write to the Free
  22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23 */
  24
  25#include <linux/module.h>
  26#include <linux/config.h>
  27#include <linux/slab.h>
  28#include <linux/raid/raid1.h>
  29#include <asm/atomic.h>
  30
  /*
   * Driver identification switches.  None of the three is referenced in the
   * visible part of this file.
   * NOTE(review): presumably consumed by md/block-layer header macros - confirm.
   */
  31#define MAJOR_NR MD_MAJOR
  32#define MD_DRIVER
  33#define MD_PERSONALITY
  34
  /* NOTE(review): not referenced in the visible part of this file. */
  35#define MAX_WORK_PER_DISK 128
  36
  /*
   * Scale of the reserved pools: an allocator that had to block resumes once
   * the r1bh pool holds more than NR_RESERVED_BUFS/2 entries, or the
   * buffer_head pool more than raid_disks * NR_RESERVED_BUFS/2.
   */
  37#define NR_RESERVED_BUFS        32
  38
  39
  40/*
  41 * The following can be used to debug the driver
  42 */
  43#define RAID1_DEBUG     0
  44
  /*
   * With RAID1_DEBUG set, PRINTK() forwards to printk() and the inline
   * keywords are defined away; otherwise PRINTK() expands to an empty
   * statement.
   */
  45#if RAID1_DEBUG
  46#define PRINTK(x...)   printk(x)
  47#define inline
  48#define __inline__
  49#else
  50#define PRINTK(x...)  do { } while (0)
  51#endif
  52
  53
  /*
   * Tentative definition only.
   * NOTE(review): the initialised definition is presumably further down the
   * file - not visible here.
   */
  54static mdk_personality_t raid1_personality;
  /* Guards raid1_retry_list and raid1_retry_tail (see raid1_reschedule_retry). */
  55static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
  /*
   * Singly linked queue (via next_r1) of requests waiting to be retried.
   * raid1_retry_tail is only meaningful while the list is non-empty; the
   * enqueue path re-seeds it whenever it finds the list empty.
   */
  56struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
  57
  58static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
  59{
  60        /* return a linked list of "cnt" struct buffer_heads.
  61         * don't take any off the free list unless we know we can
  62         * get all we need, otherwise we could deadlock
  63         */
  64        struct buffer_head *bh=NULL;
  65
  66        while(cnt) {
  67                struct buffer_head *t;
  68                md_spin_lock_irq(&conf->device_lock);
  69                if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
  70                        while (cnt) {
  71                                t = conf->freebh;
  72                                conf->freebh = t->b_next;
  73                                t->b_next = bh;
  74                                bh = t;
  75                                t->b_state = 0;
  76                                conf->freebh_cnt--;
  77                                cnt--;
  78                        }
  79                md_spin_unlock_irq(&conf->device_lock);
  80                if (cnt == 0)
  81                        break;
  82                t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
  83                if (t) {
  84                        t->b_next = bh;
  85                        bh = t;
  86                        cnt--;
  87                } else {
  88                        PRINTK("raid1: waiting for %d bh\n", cnt);
  89                        conf->freebh_blocked = 1;
  90                        wait_disk_event(conf->wait_buffer,
  91                                        !conf->freebh_blocked ||
  92                                        conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
  93                        conf->freebh_blocked = 0;
  94                }
  95        }
  96        return bh;
  97}
  98
  99static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
 100{
 101        unsigned long flags;
 102        spin_lock_irqsave(&conf->device_lock, flags);
 103        while (bh) {
 104                struct buffer_head *t = bh;
 105                bh=bh->b_next;
 106                if (t->b_pprev == NULL)
 107                        kmem_cache_free(bh_cachep, t);
 108                else {
 109                        t->b_next= conf->freebh;
 110                        conf->freebh = t;
 111                        conf->freebh_cnt++;
 112                }
 113        }
 114        spin_unlock_irqrestore(&conf->device_lock, flags);
 115        wake_up(&conf->wait_buffer);
 116}
 117
 118static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
 119{
 120        /* allocate cnt buffer_heads, possibly less if kmalloc fails */
 121        int i = 0;
 122
 123        while (i < cnt) {
 124                struct buffer_head *bh;
 125                bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
 126                if (!bh) break;
 127
 128                md_spin_lock_irq(&conf->device_lock);
 129                bh->b_pprev = &conf->freebh;
 130                bh->b_next = conf->freebh;
 131                conf->freebh = bh;
 132                conf->freebh_cnt++;
 133                md_spin_unlock_irq(&conf->device_lock);
 134
 135                i++;
 136        }
 137        return i;
 138}
 139
 140static void raid1_shrink_bh(raid1_conf_t *conf)
 141{
 142        /* discard all buffer_heads */
 143
 144        md_spin_lock_irq(&conf->device_lock);
 145        while (conf->freebh) {
 146                struct buffer_head *bh = conf->freebh;
 147                conf->freebh = bh->b_next;
 148                kmem_cache_free(bh_cachep, bh);
 149                conf->freebh_cnt--;
 150        }
 151        md_spin_unlock_irq(&conf->device_lock);
 152}
 153                
 154
 155static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
 156{
 157        struct raid1_bh *r1_bh = NULL;
 158
 159        do {
 160                md_spin_lock_irq(&conf->device_lock);
 161                if (!conf->freer1_blocked && conf->freer1) {
 162                        r1_bh = conf->freer1;
 163                        conf->freer1 = r1_bh->next_r1;
 164                        conf->freer1_cnt--;
 165                        r1_bh->next_r1 = NULL;
 166                        r1_bh->state = (1 << R1BH_PreAlloc);
 167                        r1_bh->bh_req.b_state = 0;
 168                }
 169                md_spin_unlock_irq(&conf->device_lock);
 170                if (r1_bh)
 171                        return r1_bh;
 172                r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
 173                if (r1_bh) {
 174                        memset(r1_bh, 0, sizeof(*r1_bh));
 175                        return r1_bh;
 176                }
 177                conf->freer1_blocked = 1;
 178                wait_disk_event(conf->wait_buffer,
 179                                !conf->freer1_blocked ||
 180                                conf->freer1_cnt > NR_RESERVED_BUFS/2
 181                        );
 182                conf->freer1_blocked = 0;
 183        } while (1);
 184}
 185
 186static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
 187{
 188        struct buffer_head *bh = r1_bh->mirror_bh_list;
 189        raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
 190
 191        r1_bh->mirror_bh_list = NULL;
 192
 193        if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
 194                unsigned long flags;
 195                spin_lock_irqsave(&conf->device_lock, flags);
 196                r1_bh->next_r1 = conf->freer1;
 197                conf->freer1 = r1_bh;
 198                conf->freer1_cnt++;
 199                spin_unlock_irqrestore(&conf->device_lock, flags);
 200                /* don't need to wakeup wait_buffer because
 201                 *  raid1_free_bh below will do that
 202                 */
 203        } else {
 204                kfree(r1_bh);
 205        }
 206        raid1_free_bh(conf, bh);
 207}
 208
 209static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
 210{
 211        int i = 0;
 212
 213        while (i < cnt) {
 214                struct raid1_bh *r1_bh;
 215                r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
 216                if (!r1_bh)
 217                        break;
 218                memset(r1_bh, 0, sizeof(*r1_bh));
 219                set_bit(R1BH_PreAlloc, &r1_bh->state);
 220                r1_bh->mddev = conf->mddev;
 221
 222                raid1_free_r1bh(r1_bh);
 223                i++;
 224        }
 225        return i;
 226}
 227
 228static void raid1_shrink_r1bh(raid1_conf_t *conf)
 229{
 230        md_spin_lock_irq(&conf->device_lock);
 231        while (conf->freer1) {
 232                struct raid1_bh *r1_bh = conf->freer1;
 233                conf->freer1 = r1_bh->next_r1;
 234                conf->freer1_cnt--;
 235                kfree(r1_bh);
 236        }
 237        md_spin_unlock_irq(&conf->device_lock);
 238}
 239
 240
 241
 242static inline void raid1_free_buf(struct raid1_bh *r1_bh)
 243{
 244        unsigned long flags;
 245        struct buffer_head *bh = r1_bh->mirror_bh_list;
 246        raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
 247        r1_bh->mirror_bh_list = NULL;
 248        
 249        spin_lock_irqsave(&conf->device_lock, flags);
 250        r1_bh->next_r1 = conf->freebuf;
 251        conf->freebuf = r1_bh;
 252        spin_unlock_irqrestore(&conf->device_lock, flags);
 253        raid1_free_bh(conf, bh);
 254}
 255
 256static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
 257{
 258        struct raid1_bh *r1_bh;
 259
 260        md_spin_lock_irq(&conf->device_lock);
 261        wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
 262        r1_bh = conf->freebuf;
 263        conf->freebuf = r1_bh->next_r1;
 264        r1_bh->next_r1= NULL;
 265        md_spin_unlock_irq(&conf->device_lock);
 266
 267        return r1_bh;
 268}
 269
 270static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
 271{
 272        int i = 0;
 273        struct raid1_bh *head = NULL, **tail;
 274        tail = &head;
 275
 276        while (i < cnt) {
 277                struct raid1_bh *r1_bh;
 278                struct page *page;
 279
 280                page = alloc_page(GFP_KERNEL);
 281                if (!page)
 282                        break;
 283
 284                r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
 285                if (!r1_bh) {
 286                        __free_page(page);
 287                        break;
 288                }
 289                memset(r1_bh, 0, sizeof(*r1_bh));
 290                r1_bh->bh_req.b_page = page;
 291                r1_bh->bh_req.b_data = page_address(page);
 292                *tail = r1_bh;
 293                r1_bh->next_r1 = NULL;
 294                tail = & r1_bh->next_r1;
 295                i++;
 296        }
 297        /* this lock probably isn't needed, as at the time when
 298         * we are allocating buffers, nobody else will be touching the
 299         * freebuf list.  But it doesn't hurt....
 300         */
 301        md_spin_lock_irq(&conf->device_lock);
 302        *tail = conf->freebuf;
 303        conf->freebuf = head;
 304        md_spin_unlock_irq(&conf->device_lock);
 305        return i;
 306}
 307
 308static void raid1_shrink_buffers (raid1_conf_t *conf)
 309{
 310        struct raid1_bh *head;
 311        md_spin_lock_irq(&conf->device_lock);
 312        head = conf->freebuf;
 313        conf->freebuf = NULL;
 314        md_spin_unlock_irq(&conf->device_lock);
 315
 316        while (head) {
 317                struct raid1_bh *r1_bh = head;
 318                head = r1_bh->next_r1;
 319                __free_page(r1_bh->bh_req.b_page);
 320                kfree(r1_bh);
 321        }
 322}
 323
 324static int raid1_map (mddev_t *mddev, kdev_t *rdev)
 325{
 326        raid1_conf_t *conf = mddev_to_conf(mddev);
 327        int i, disks = MD_SB_DISKS;
 328
 329        /*
 330         * Later we do read balancing on the read side 
 331         * now we use the first available disk.
 332         */
 333
 334        for (i = 0; i < disks; i++) {
 335                if (conf->mirrors[i].operational) {
 336                        *rdev = conf->mirrors[i].dev;
 337                        return (0);
 338                }
 339        }
 340
 341        printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
 342        return (-1);
 343}
 344
 345static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
 346{
 347        unsigned long flags;
 348        mddev_t *mddev = r1_bh->mddev;
 349        raid1_conf_t *conf = mddev_to_conf(mddev);
 350
 351        md_spin_lock_irqsave(&retry_list_lock, flags);
 352        if (raid1_retry_list == NULL)
 353                raid1_retry_tail = &raid1_retry_list;
 354        *raid1_retry_tail = r1_bh;
 355        raid1_retry_tail = &r1_bh->next_r1;
 356        r1_bh->next_r1 = NULL;
 357        md_spin_unlock_irqrestore(&retry_list_lock, flags);
 358        md_wakeup_thread(conf->thread);
 359}
 360
 361
 362static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
 363{
 364        unsigned long flags;
 365        spin_lock_irqsave(&conf->segment_lock, flags);
 366        if (sector < conf->start_active)
 367                conf->cnt_done--;
 368        else if (sector >= conf->start_future && conf->phase == phase)
 369                conf->cnt_future--;
 370        else if (!--conf->cnt_pending)
 371                wake_up(&conf->wait_ready);
 372
 373        spin_unlock_irqrestore(&conf->segment_lock, flags);
 374}
 375
 376static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
 377{
 378        unsigned long flags;
 379        spin_lock_irqsave(&conf->segment_lock, flags);
 380        if (sector >= conf->start_ready)
 381                --conf->cnt_ready;
 382        else if (sector >= conf->start_active) {
 383                if (!--conf->cnt_active) {
 384                        conf->start_active = conf->start_ready;
 385                        wake_up(&conf->wait_done);
 386                }
 387        }
 388        spin_unlock_irqrestore(&conf->segment_lock, flags);
 389}
 390
 391/*
 392 * raid1_end_bh_io() is called when we have finished servicing a mirrored
 393 * operation and are ready to return a success/failure code to the buffer
 394 * cache layer.
 395 */
 396static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
 397{
 398        struct buffer_head *bh = r1_bh->master_bh;
 399
 400        io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
 401                        test_bit(R1BH_SyncPhase, &r1_bh->state));
 402
 403        bh->b_end_io(bh, uptodate);
 404        raid1_free_r1bh(r1_bh);
 405}
 406void raid1_end_request (struct buffer_head *bh, int uptodate)
 407{
 408        struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
 409
 410        /*
 411         * this branch is our 'one mirror IO has finished' event handler:
 412         */
 413        if (!uptodate)
 414                md_error (r1_bh->mddev, bh->b_dev);
 415        else
 416                /*
 417                 * Set R1BH_Uptodate in our master buffer_head, so that
 418                 * we will return a good error code for to the higher
 419                 * levels even if IO on some other mirrored buffer fails.
 420                 *
 421                 * The 'master' represents the complex operation to 
 422                 * user-side. So if something waits for IO, then it will
 423                 * wait for the 'master' buffer_head.
 424                 */
 425                set_bit (R1BH_Uptodate, &r1_bh->state);
 426
 427        /*
 428         * We split up the read and write side, imho they are 
 429         * conceptually different.
 430         */
 431
 432        if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
 433                /*
 434                 * we have only one buffer_head on the read side
 435                 */
 436                
 437                if (uptodate) {
 438                        raid1_end_bh_io(r1_bh, uptodate);
 439                        return;
 440                }
 441                /*
 442                 * oops, read error:
 443                 */
 444                printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", 
 445                         partition_name(bh->b_dev), bh->b_blocknr);
 446                raid1_reschedule_retry(r1_bh);
 447                return;
 448        }
 449
 450        /*
 451         * WRITE:
 452         *
 453         * Let's see if all mirrored write operations have finished 
 454         * already.
 455         */
 456
 457        if (atomic_dec_and_test(&r1_bh->remaining))
 458                raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
 459}
 460
 461/*
 462 * This routine returns the disk from which the requested read should
 463 * be done. It bookkeeps the last read position for every disk
 464 * in array and when new read requests come, the disk which last
 465 * position is nearest to the request, is chosen.
 466 *
 467 * TODO: now if there are 2 mirrors in the same 2 devices, performance
 468 * degrades dramatically because position is mirror, not device based.
 469 * This should be changed to be device based. Also atomic sequential
 470 * reads should be somehow balanced.
     *
     * conf: array configuration.  Before returning, last_used, sect_count and
     *       the chosen mirror's head_position are updated.
     * bh:   the read request; only b_rsector and b_size are consulted.
     *
     * Returns an index into conf->mirrors.  If no operational mirror is
     * found, the previous conf->last_used is returned unchanged.
 471 */
 472
 473static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
 474{
 475        int new_disk = conf->last_used;
 476        const int sectors = bh->b_size >> 9;
 477        const unsigned long this_sector = bh->b_rsector;
 478        int disk = new_disk;
 479        unsigned long new_distance;
 480        unsigned long current_distance;
 481        
 482        /*
 483         * Check if it is sane at all to balance
     *
     * While resync_mirrors is set, simply stay on last_used.
 484         */
 485        
 486        if (conf->resync_mirrors)
 487                goto rb_out;
 488        
 489
 490        /* make sure that disk is operational */
            /* (walks downwards through the raid_disks slots, wrapping at 0) */
 491        while( !conf->mirrors[new_disk].operational) {
 492                if (new_disk <= 0) new_disk = conf->raid_disks;
 493                new_disk--;
 494                if (new_disk == disk) {
 495                        /*
 496                         * This means no working disk was found
 497                         * Nothing much to do, lets not change anything
 498                         * and hope for the best...
 499                         */
 500                        
 501                        new_disk = conf->last_used;
 502
 503                        goto rb_out;
 504                }
 505        }
 506        disk = new_disk;
 507        /* now disk == new_disk == starting point for search */
 508        
 509        /*
 510         * Don't touch anything for sequential reads.
 511         */
 512
 513        if (this_sector == conf->mirrors[new_disk].head_position)
 514                goto rb_out;
 515        
 516        /*
 517         * If reads have been done only on a single disk
 518         * for a time, lets give another disk a chance.
 519         * This is for kicking those idling disks so that
 520         * they would find work near some hotspot.
 521         */
 522        
 523        if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
 524                conf->sect_count = 0;
 525
 526#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
 527                /* Work around a compiler bug in egcs-2.92.11 19980921 */
 528                new_disk = *(volatile int *)&new_disk;
 529#endif
                    /*
                     * Step downwards (with wrap-around) to the next mirror that
                     * is operational and not write_only; give up and keep the
                     * current mirror if the walk comes back to where it began.
                     */
 530                do {
 531                        if (new_disk<=0)
 532                                new_disk = conf->raid_disks;
 533                        new_disk--;
 534                        if (new_disk == disk)
 535                                break;
 536                } while ((conf->mirrors[new_disk].write_only) ||
 537                         (!conf->mirrors[new_disk].operational));
 538
 539                goto rb_out;
 540        }
 541        
            /*
             * NOTE(review): the operands are unsigned long; whether abs()
             * keeps the full width depends on its definition, which is not
             * visible here - confirm for large devices.
             */
 542        current_distance = abs(this_sector -
 543                                conf->mirrors[disk].head_position);
 544        
 545        /* Find the disk which is closest */
            /*
             * The `continue` below jumps to the loop condition, so skipped
             * mirrors still let the scan terminate at last_used.  A mirror
             * only replaces the current choice if it is strictly closer.
             */
 546        
 547        do {
 548                if (disk <= 0)
 549                        disk = conf->raid_disks;
 550                disk--;
 551                
 552                if ((conf->mirrors[disk].write_only) ||
 553                                (!conf->mirrors[disk].operational))
 554                        continue;
 555                
 556                new_distance = abs(this_sector -
 557                                        conf->mirrors[disk].head_position);
 558                
 559                if (new_distance < current_distance) {
 560                        conf->sect_count = 0;
 561                        current_distance = new_distance;
 562                        new_disk = disk;
 563                }
 564        } while (disk != conf->last_used);
 565
 566rb_out:
            /* remember where the chosen mirror's head will be after this read */
 567        conf->mirrors[new_disk].head_position = this_sector + sectors;
 568
 569        conf->last_used = new_disk;
            /* sectors read since the last switch; compared against sect_limit above */
 570        conf->sect_count += sectors;
 571
 572        return new_disk;
 573}
 574
 /*
  * raid1_make_request - map one request on the array onto its mirrors.
  *
  * mddev: the array
  * rw:    READ, READA or WRITE (READA is treated as READ)
  * bh:    the caller's buffer_head; must be locked (BUG() otherwise)
  *
  * A read is sent to the single mirror chosen by raid1_read_balance(), using
  * the buffer_head embedded in the descriptor (bh_req).  A write gets one
  * private buffer_head per operational mirror, all sharing the caller's data
  * page; the caller's bh is completed from raid1_end_request() once every
  * mirror write has finished.
  *
  * Always returns 0.  May sleep (descriptor/buffer_head allocation and the
  * wait on wait_done).
  */
 575static int raid1_make_request (mddev_t *mddev, int rw,
 576                               struct buffer_head * bh)
 577{
 578        raid1_conf_t *conf = mddev_to_conf(mddev);
 579        struct buffer_head *bh_req, *bhl;
 580        struct raid1_bh * r1_bh;
 581        int disks = MD_SB_DISKS;
 582        int i, sum_bhs = 0;
 583        struct mirror_info *mirror;
 584
 585        if (!buffer_locked(bh))
 586                BUG();
 587        
 588/*
 589 * make_request() can abort the operation when READA is being
 590 * used and no empty request is available.
 591 *
 592 * Currently, just replace the command with READ/WRITE.
 593 */
 594        if (rw == READA)
 595                rw = READ;
 596
 597        r1_bh = raid1_alloc_r1bh (conf);
 598
            /*
             * Wait until the request lies outside [start_active, start_future),
             * then account it: below start_active in cnt_done, otherwise in
             * cnt_future.  In the latter case the descriptor is tagged with
             * R1BH_SyncPhase when conf->phase is set, so that
             * io_request_done() can compare it with the phase current at
             * completion time.
             */
 599        spin_lock_irq(&conf->segment_lock);
 600        wait_event_lock_irq(conf->wait_done,
 601                        bh->b_rsector < conf->start_active ||
 602                        bh->b_rsector >= conf->start_future,
 603                        conf->segment_lock);
 604        if (bh->b_rsector < conf->start_active) 
 605                conf->cnt_done++;
 606        else {
 607                conf->cnt_future++;
 608                if (conf->phase)
 609                        set_bit(R1BH_SyncPhase, &r1_bh->state);
 610        }
 611        spin_unlock_irq(&conf->segment_lock);
 612        
 613        /*
 614         * i think the read and write branch should be separated completely,
 615         * since we want to do read balancing on the read side for example.
 616         * Alternative implementations? :) --mingo
 617         */
 618
 619        r1_bh->master_bh = bh;
 620        r1_bh->mddev = mddev;
 621        r1_bh->cmd = rw;
 622
 623        if (rw == READ) {
 624                /*
 625                 * read balancing logic:
 626                 */
 627                mirror = conf->mirrors + raid1_read_balance(conf, bh);
 628
                    /*
                     * bh_req starts as a byte copy of the caller's bh and is
                     * then redirected to the chosen mirror, with completion
                     * routed to raid1_end_request().
                     */
 629                bh_req = &r1_bh->bh_req;
 630                memcpy(bh_req, bh, sizeof(*bh));
 631                bh_req->b_blocknr = bh->b_rsector;
 632                bh_req->b_dev = mirror->dev;
 633                bh_req->b_rdev = mirror->dev;
 634        /*      bh_req->b_rsector = bh->n_rsector; */
 635                bh_req->b_end_io = raid1_end_request;
 636                bh_req->b_private = r1_bh;
 637                generic_make_request (rw, bh_req);
 638                return 0;
 639        }
 640
 641        /*
 642         * WRITE:
 643         */
 644
            /*
             * conf->raid_disks heads are requested, while the loop below
             * visits every operational slot in 0..MD_SB_DISKS-1; running out
             * of heads is reported through MD_BUG() and ends the loop.
             */
 645        bhl = raid1_alloc_bh(conf, conf->raid_disks);
 646        for (i = 0; i < disks; i++) {
 647                struct buffer_head *mbh;
 648                if (!conf->mirrors[i].operational) 
 649                        continue;
 650 
 651        /*
 652         * We should use a private pool (size depending on NR_REQUEST),
 653         * to avoid writes filling up the memory with bhs
 654         *
 655         * Such pools are much faster than kmalloc anyways (so we waste
 656         * almost nothing by not using the master bh when writing and
 657         * win alot of cleanness) but for now we are cool enough. --mingo
 658         *
 659         * It's safe to sleep here, buffer heads cannot be used in a shared
 660         * manner in the write branch. Look how we lock the buffer at the
 661         * beginning of this function to grok the difference ;)
 662         */
 663                mbh = bhl;
 664                if (mbh == NULL) {
 665                        MD_BUG();
 666                        break;
 667                }
 668                bhl = mbh->b_next;
 669                mbh->b_next = NULL;
                    /* NOTE(review): sentinel value; its meaning is not visible here */
 670                mbh->b_this_page = (struct buffer_head *)1;
 671                
 672        /*
 673         * prepare mirrored mbh (fields ordered for max mem throughput):
 674         */
 675                mbh->b_blocknr    = bh->b_rsector;
 676                mbh->b_dev        = conf->mirrors[i].dev;
 677                mbh->b_rdev       = conf->mirrors[i].dev;
 678                mbh->b_rsector    = bh->b_rsector;
 679                mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
 680                                                (1<<BH_Mapped) | (1<<BH_Lock);
 681
 682                atomic_set(&mbh->b_count, 1);
 683                mbh->b_size       = bh->b_size;
 684                mbh->b_page       = bh->b_page;
 685                mbh->b_data       = bh->b_data;
 686                mbh->b_list       = BUF_LOCKED;
 687                mbh->b_end_io     = raid1_end_request;
 688                mbh->b_private    = r1_bh;
 689
                    /* collect the prepared head; submission happens after the loop */
 690                mbh->b_next = r1_bh->mirror_bh_list;
 691                r1_bh->mirror_bh_list = mbh;
 692                sum_bhs++;
 693        }
            /* give back any heads that were not needed */
 694        if (bhl) raid1_free_bh(conf,bhl);
 695        if (!sum_bhs) {
 696                /* Gag - all mirrors non-operational.. */
 697                raid1_end_bh_io(r1_bh, 0);
 698                return 0;
 699        }
            /* must be set before any write is submitted; see the comment below */
 700        md_atomic_set(&r1_bh->remaining, sum_bhs);
 701
 702        /*
 703         * We have to be a bit careful about the semaphore above, thats
 704         * why we start the requests separately. Since kmalloc() could
 705         * fail, sleep and make_request() can sleep too, this is the
 706         * safer solution. Imagine, end_request decreasing the semaphore
 707         * before we could have set it up ... We could play tricks with
 708         * the semaphore (presetting it and correcting at the end if
 709         * sum_bhs is not 'n' but we have to do end_request by hand if
 710         * all requests finish until we had a chance to set up the
 711         * semaphore correctly ... lots of races).
     *
     * (The "semaphore" is the atomic counter r1_bh->remaining.)
 712         */
 713        bh = r1_bh->mirror_bh_list;
 714        while(bh) {
                    /* read b_next before submitting bh2 */
 715                struct buffer_head *bh2 = bh;
 716                bh = bh->b_next;
 717                generic_make_request(rw, bh2);
 718        }
 719        return (0);
 720}
 721
 722static void raid1_status(struct seq_file *seq, mddev_t *mddev)
 723{
 724        raid1_conf_t *conf = mddev_to_conf(mddev);
 725        int i;
 726        
 727        seq_printf(seq, " [%d/%d] [", conf->raid_disks,
 728                                                 conf->working_disks);
 729        for (i = 0; i < conf->raid_disks; i++)
 730                seq_printf(seq, "%s",
 731                        conf->mirrors[i].operational ? "U" : "_");
 732        seq_printf(seq, "]");
 733}
 734
 /*
  * printk() format strings, each prefixed with its log level.  In the visible
  * part of this file only DISK_FAILED is used (by mark_disk_bad()); it takes
  * the device name (%s) and the number of mirrors still working (%d).
  */
 735#define LAST_DISK KERN_ALERT \
 736"raid1: only one disk left and IO error.\n"
 737
 738#define NO_SPARE_DISK KERN_ALERT \
 739"raid1: no spare disk left, degrading mirror level by one.\n"
 740
 741#define DISK_FAILED KERN_ALERT \
 742"raid1: Disk failure on %s, disabling device. \n" \
 743"       Operation continuing on %d devices\n"
 744
 745#define START_SYNCING KERN_ALERT \
 746"raid1: start syncing spare disk.\n"
 747
 748#define ALREADY_SYNCING KERN_INFO \
 749"raid1: syncing already in progress.\n"
 750
 751static void mark_disk_bad (mddev_t *mddev, int failed)
 752{
 753        raid1_conf_t *conf = mddev_to_conf(mddev);
 754        struct mirror_info *mirror = conf->mirrors+failed;
 755        mdp_super_t *sb = mddev->sb;
 756
 757        mirror->operational = 0;
 758        mark_disk_faulty(sb->disks+mirror->number);
 759        mark_disk_nonsync(sb->disks+mirror->number);
 760        mark_disk_inactive(sb->disks+mirror->number);
 761        if (!mirror->write_only)
 762                sb->active_disks--;
 763        sb->working_disks--;
 764        sb->failed_disks++;
 765        mddev->sb_dirty = 1;
 766        md_wakeup_thread(conf->thread);
 767        if (!mirror->write_only)
 768                conf->working_disks--;
 769        printk (DISK_FAILED, partition_name (mirror->dev),
 770                                 conf->working_disks);
 771}
 772
 773static int raid1_error (mddev_t *mddev, kdev_t dev)
 774{
 775        raid1_conf_t *conf = mddev_to_conf(mddev);
 776        struct mirror_info * mirrors = conf->mirrors;
 777        int disks = MD_SB_DISKS;
 778        int i;
 779
 780        /* Find the drive.
 781         * If it is not operational, then we have already marked it as dead
 782         * else if it is the last working disks, ignore the error, let the
 783         * next level up know.
 784         * else mark the drive as failed
 785         */
 786
 787        for (i = 0; i < disks; i++)
 788                if (mirrors[i].dev==dev && mirrors[i].operational)
 789                        break;
 790        if (i == disks)
 791                return 0;
 792
 793        if (i < conf->raid_disks && conf->working_disks == 1) {
 794                /* Don't fail the drive, act as though we were just a
 795                 * normal single drive
 796                 */
 797
 798                return 1;
 799        }
 800        mark_disk_bad(mddev, i);
 801        return 0;
 802}
 803
 /*
  * Retire the message macros defined above mark_disk_bad().  ALREADY_SYNCING
  * is not undefined here and therefore stays defined.
  */
 804#undef LAST_DISK
 805#undef NO_SPARE_DISK
 806#undef DISK_FAILED
 807#undef START_SYNCING
 808
 809
 810static void print_raid1_conf (raid1_conf_t *conf)
 811{
 812        int i;
 813        struct mirror_info *tmp;
 814
 815        printk("RAID1 conf printout:\n");
 816        if (!conf) {
 817                printk("(conf==NULL)\n");
 818                return;
 819        }
 820        printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
 821                         conf->raid_disks, conf->nr_disks);
 822
 823        for (i = 0; i < MD_SB_DISKS; i++) {
 824                tmp = conf->mirrors + i;
 825                printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
 826                        i, tmp->spare,tmp->operational,
 827                        tmp->number,tmp->raid_disk,tmp->used_slot,
 828                        partition_name(tmp->dev));
 829        }
 830}
 831
 /*
  * close_sync - collapse the resync window boundaries back to zero.
  *
  * Runs under segment_lock, which is released while sleeping inside
  * wait_event_lock_irq().  The statement order matters: each wait must see
  * the boundaries and counters exactly as set by the lines before it.
  * Finishes by waking everybody sleeping on wait_done.
  */
 832static void close_sync(raid1_conf_t *conf)
 833{
 834        mddev_t *mddev = conf->mddev;
 835        /* If reconstruction was interrupted, we need to close the "active" and "pending"
 836         * holes.
 837         * we know that there are no active rebuild requests, so cnt_active == cnt_ready ==0
 838         */
 839        /* this is really needed when recovery stops too... */
 840        spin_lock_irq(&conf->segment_lock);
            /* step 1: close the active/ready holes, then wait for pending I/O to drain */
 841        conf->start_active = conf->start_pending;
 842        conf->start_ready = conf->start_pending;
 843        wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
            /*
             * step 2: move every boundary up to start_future, push start_future
             * past the end of the device (sb->size << 1, plus one), turn the
             * "future" requests into "pending" ones and flip the phase so that
             * requests accounted from now on are told apart from them; then
             * wait for the old ones to drain.
             */
 844        conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
 845        conf->start_future = (mddev->sb->size<<1)+1;
 846        conf->cnt_pending = conf->cnt_future;
 847        conf->cnt_future = 0;
 848        conf->phase = conf->phase ^1;
 849        wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
            /*
             * step 3: reset all boundaries and the phase to zero; with
             * start_future == 0 every request now counts as "future", so the
             * outstanding "done" count is carried over into cnt_future.
             */
 850        conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
 851        conf->phase = 0;
 852        conf->cnt_future = conf->cnt_done;;
 853        conf->cnt_done = 0;
 854        spin_unlock_irq(&conf->segment_lock);
 855        wake_up(&conf->wait_done);
 856}
 857
 858static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 859{
 860        int err = 0;
 861        int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
 862        raid1_conf_t *conf = mddev->private;
 863        struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
 864        mdp_super_t *sb = mddev->sb;
 865        mdp_disk_t *failed_desc, *spare_desc, *added_desc;
 866        mdk_rdev_t *spare_rdev, *failed_rdev;
 867
 868        print_raid1_conf(conf);
 869
 870        switch (state) {
 871        case DISKOP_SPARE_ACTIVE:
 872        case DISKOP_SPARE_INACTIVE:
 873                /* need to wait for pending sync io before locking device */
 874                close_sync(conf);
 875        }
 876
 877        md_spin_lock_irq(&conf->device_lock);
 878        /*
 879         * find the disk ...
 880         */
 881        switch (state) {
 882
 883        case DISKOP_SPARE_ACTIVE:
 884
 885                /*
 886                 * Find the failed disk within the RAID1 configuration ...
 887                 * (this can only be in the first conf->working_disks part)
 888                 */
 889                for (i = 0; i < conf->raid_disks; i++) {
 890                        tmp = conf->mirrors + i;
 891                        if ((!tmp->operational && !tmp->spare) ||
 892                                        !tmp->used_slot) {
 893                                failed_disk = i;
 894                                break;
 895                        }
 896                }
 897                /*
 898                 * When we activate a spare disk we _must_ have a disk in
 899                 * the lower (active) part of the array to replace. 
 900                 */
 901                if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
 902                        MD_BUG();
 903                        err = 1;
 904                        goto abort;
 905                }
 906                /* fall through */
 907
 908        case DISKOP_SPARE_WRITE:
 909        case DISKOP_SPARE_INACTIVE:
 910
 911                /*
 912                 * Find the spare disk ... (can only be in the 'high'
 913                 * area of the array)
 914                 */
 915                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
 916                        tmp = conf->mirrors + i;
 917                        if (tmp->spare && tmp->number == (*d)->number) {
 918                                spare_disk = i;
 919                                break;
 920                        }
 921                }
 922                if (spare_disk == -1) {
 923                        MD_BUG();
 924                        err = 1;
 925                        goto abort;
 926                }
 927                break;
 928
 929        case DISKOP_HOT_REMOVE_DISK:
 930
 931                for (i = 0; i < MD_SB_DISKS; i++) {
 932                        tmp = conf->mirrors + i;
 933                        if (tmp->used_slot && (tmp->number == (*d)->number)) {
 934                                if (tmp->operational) {
 935                                        err = -EBUSY;
 936                                        goto abort;
 937                                }
 938                                removed_disk = i;
 939                                break;
 940                        }
 941                }
 942                if (removed_disk == -1) {
 943                        MD_BUG();
 944                        err = 1;
 945                        goto abort;
 946                }
 947                break;
 948
 949        case DISKOP_HOT_ADD_DISK:
 950
 951                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
 952                        tmp = conf->mirrors + i;
 953                        if (!tmp->used_slot) {
 954                                added_disk = i;
 955                                break;
 956                        }
 957                }
 958                if (added_disk == -1) {
 959                        MD_BUG();
 960                        err = 1;
 961                        goto abort;
 962                }
 963                break;
 964        }
 965
 966        switch (state) {
 967        /*
 968         * Switch the spare disk to write-only mode:
 969         */
 970        case DISKOP_SPARE_WRITE:
 971                sdisk = conf->mirrors + spare_disk;
 972                sdisk->operational = 1;
 973                sdisk->write_only = 1;
 974                break;
 975        /*
 976         * Deactivate a spare disk:
 977         */
 978        case DISKOP_SPARE_INACTIVE:
 979                if (conf->start_future > 0) {
 980                        MD_BUG();
 981                        err = -EBUSY;
 982                        break;
 983                }
 984                sdisk = conf->mirrors + spare_disk;
 985                sdisk->operational = 0;
 986                sdisk->write_only = 0;
 987                break;
 988        /*
 989         * Activate (mark read-write) the (now sync) spare disk,
 990         * which means we switch it's 'raid position' (->raid_disk)
 991         * with the failed disk. (only the first 'conf->nr_disks'
 992         * slots are used for 'real' disks and we must preserve this
 993         * property)
 994         */
 995        case DISKOP_SPARE_ACTIVE:
 996                if (conf->start_future > 0) {
 997                        MD_BUG();
 998                        err = -EBUSY;
 999                        break;
1000                }
1001                sdisk = conf->mirrors + spare_disk;
1002                fdisk = conf->mirrors + failed_disk;
1003
1004                spare_desc = &sb->disks[sdisk->number];
1005                failed_desc = &sb->disks[fdisk->number];
1006
1007                if (spare_desc != *d) {
1008                        MD_BUG();
1009                        err = 1;
1010                        goto abort;
1011                }
1012
1013                if (spare_desc->raid_disk != sdisk->raid_disk) {
1014                        MD_BUG();
1015                        err = 1;
1016                        goto abort;
1017                }
1018                        
1019                if (sdisk->raid_disk != spare_disk) {
1020                        MD_BUG();
1021                        err = 1;
1022                        goto abort;
1023                }
1024
1025                if (failed_desc->raid_disk != fdisk->raid_disk) {
1026                        MD_BUG();
1027                        err = 1;
1028                        goto abort;
1029                }
1030
1031                if (fdisk->raid_disk != failed_disk) {
1032                        MD_BUG();
1033                        err = 1;
1034                        goto abort;
1035                }
1036
1037                /*
1038                 * do the switch finally
1039                 */
1040                spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1041                failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1042
1043                /* There must be a spare_rdev, but there may not be a
1044                 * failed_rdev.  That slot might be empty...
1045                 */
1046                spare_rdev->desc_nr = failed_desc->number;
1047                if (failed_rdev)
1048                        failed_rdev->desc_nr = spare_desc->number;
1049                
1050                xchg_values(*spare_desc, *failed_desc);
1051                xchg_values(*fdisk, *sdisk);
1052
1053                /*
1054                 * (careful, 'failed' and 'spare' are switched from now on)
1055                 *
1056                 * we want to preserve linear numbering and we want to
1057                 * give the proper raid_disk number to the now activated
1058                 * disk. (this means we switch back these values)
1059                 */
1060        
1061                xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1062                xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1063                xchg_values(spare_desc->number, failed_desc->number);
1064                xchg_values(sdisk->number, fdisk->number);
1065
1066                *d = failed_desc;
1067
1068                if (sdisk->dev == MKDEV(0,0))
1069                        sdisk->used_slot = 0;
1070                /*
1071                 * this really activates the spare.
1072                 */
1073                fdisk->spare = 0;
1074                fdisk->write_only = 0;
1075
1076                /*
1077                 * if we activate a spare, we definitely replace a
1078                 * non-operational disk slot in the 'low' area of
1079                 * the disk array.
1080                 */
1081
1082                conf->working_disks++;
1083
1084                break;
1085
1086        case DISKOP_HOT_REMOVE_DISK:
1087                rdisk = conf->mirrors + removed_disk;
1088
1089                if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1090                        MD_BUG();       
1091                        err = 1;
1092                        goto abort;
1093                }
1094                rdisk->dev = MKDEV(0,0);
1095                rdisk->used_slot = 0;
1096                conf->nr_disks--;
1097                break;
1098
1099        case DISKOP_HOT_ADD_DISK:
1100                adisk = conf->mirrors + added_disk;
1101                added_desc = *d;
1102
1103                if (added_disk != added_desc->number) {
1104                        MD_BUG();       
1105                        err = 1;
1106                        goto abort;
1107                }
1108
1109                adisk->number = added_desc->number;
1110                adisk->raid_disk = added_desc->raid_disk;
1111                adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1112
1113                adisk->operational = 0;
1114                adisk->write_only = 0;
1115                adisk->spare = 1;
1116                adisk->used_slot = 1;
1117                adisk->head_position = 0;
1118                conf->nr_disks++;
1119
1120                break;
1121
1122        default:
1123                MD_BUG();       
1124                err = 1;
1125                goto abort;
1126        }
1127abort:
1128        md_spin_unlock_irq(&conf->device_lock);
1129        if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1130                /* should move to "END_REBUILD" when such exists */
1131                raid1_shrink_buffers(conf);
1132
1133        print_raid1_conf(conf);
1134        return err;
1135}
1136
1137
1138#define IO_ERROR KERN_ALERT \
1139"raid1: %s: unrecoverable I/O read error for block %lu\n"
1140
1141#define REDIRECT_SECTOR KERN_ERR \
1142"raid1: %s: redirecting sector %lu to another mirror\n"
1143
1144/*
1145 * This is a kernel thread which:
1146 *
1147 *      1.      Retries failed read operations on working mirrors.
 1148 *      2.      Updates the raid superblock when problems are encountered.
 1149 *      3.      Performs writes following reads for array synchronising.
1150 */
1151static void end_sync_write(struct buffer_head *bh, int uptodate);
1152static void end_sync_read(struct buffer_head *bh, int uptodate);
1153
1154static void raid1d (void *data)
1155{
1156        struct raid1_bh *r1_bh;
1157        struct buffer_head *bh;
1158        unsigned long flags;
1159        raid1_conf_t *conf = data;
1160        mddev_t *mddev = conf->mddev;
1161        kdev_t dev;
1162
1163        if (mddev->sb_dirty)
1164                md_update_sb(mddev);
1165
1166        for (;;) {
1167                md_spin_lock_irqsave(&retry_list_lock, flags);
1168                r1_bh = raid1_retry_list;
1169                if (!r1_bh)
1170                        break;
1171                raid1_retry_list = r1_bh->next_r1;
1172                md_spin_unlock_irqrestore(&retry_list_lock, flags);
1173
1174                mddev = r1_bh->mddev;
1175                bh = &r1_bh->bh_req;
1176                switch(r1_bh->cmd) {
1177                case SPECIAL:
1178                        /* have to allocate lots of bh structures and
1179                         * schedule writes
1180                         */
1181                        if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1182                                int i, sum_bhs = 0;
1183                                int disks = MD_SB_DISKS;
1184                                struct buffer_head *bhl, *mbh;
1185                                
1186                                conf = mddev_to_conf(mddev);
1187                                bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1188                                for (i = 0; i < disks ; i++) {
1189                                        if (!conf->mirrors[i].operational)
1190                                                continue;
1191                                        if (i==conf->last_used)
1192                                                /* we read from here, no need to write */
1193                                                continue;
1194                                        if (i < conf->raid_disks
1195                                            && !conf->resync_mirrors)
1196                                                /* don't need to write this,
1197                                                 * we are just rebuilding */
1198                                                continue;
1199                                        mbh = bhl;
1200                                        if (!mbh) {
1201                                                MD_BUG();
1202                                                break;
1203                                        }
1204                                        bhl = mbh->b_next;
1205                                        mbh->b_this_page = (struct buffer_head *)1;
1206
1207                                                
1208                                /*
1209                                 * prepare mirrored bh (fields ordered for max mem throughput):
1210                                 */
1211                                        mbh->b_blocknr    = bh->b_blocknr;
1212                                        mbh->b_dev        = conf->mirrors[i].dev;
1213                                        mbh->b_rdev       = conf->mirrors[i].dev;
1214                                        mbh->b_rsector    = bh->b_blocknr;
1215                                        mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1216                                                (1<<BH_Mapped) | (1<<BH_Lock);
1217                                        atomic_set(&mbh->b_count, 1);
1218                                        mbh->b_size       = bh->b_size;
1219                                        mbh->b_page       = bh->b_page;
1220                                        mbh->b_data       = bh->b_data;
1221                                        mbh->b_list       = BUF_LOCKED;
1222                                        mbh->b_end_io     = end_sync_write;
1223                                        mbh->b_private    = r1_bh;
1224
1225                                        mbh->b_next = r1_bh->mirror_bh_list;
1226                                        r1_bh->mirror_bh_list = mbh;
1227
1228                                        sum_bhs++;
1229                                }
1230                                md_atomic_set(&r1_bh->remaining, sum_bhs);
1231                                if (bhl) raid1_free_bh(conf, bhl);
1232                                mbh = r1_bh->mirror_bh_list;
1233
1234                                if (!sum_bhs) {
1235                                        /* nowhere to write this too... I guess we
1236                                         * must be done
1237                                         */
1238                                        sync_request_done(bh->b_blocknr, conf);
1239                                        md_done_sync(mddev, bh->b_size>>9, 0);
1240                                        raid1_free_buf(r1_bh);
1241                                } else
1242                                while (mbh) {
1243                                        struct buffer_head *bh1 = mbh;
1244                                        mbh = mbh->b_next;
1245                                        generic_make_request(WRITE, bh1);
1246                                        md_sync_acct(bh1->b_dev, bh1->b_size/512);
1247                                }
1248                        } else {
1249                                /* There is no point trying a read-for-reconstruct
1250                                 * as reconstruct is about to be aborted
1251                                 */
1252
1253                                printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1254                                md_done_sync(mddev, bh->b_size>>9, 0);
1255                        }
1256
1257                        break;
1258                case READ:
1259                case READA:
1260                        dev = bh->b_dev;
1261                        raid1_map (mddev, &bh->b_dev);
1262                        if (bh->b_dev == dev) {
1263                                printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1264                                raid1_end_bh_io(r1_bh, 0);
1265                        } else {
1266                                printk (REDIRECT_SECTOR,
1267                                        partition_name(bh->b_dev), bh->b_blocknr);
1268                                bh->b_rdev = bh->b_dev;
1269                                bh->b_rsector = bh->b_blocknr;
1270                                generic_make_request (r1_bh->cmd, bh);
1271                        }
1272                        break;
1273                }
1274        }
1275        md_spin_unlock_irqrestore(&retry_list_lock, flags);
1276}
1277#undef IO_ERROR
1278#undef REDIRECT_SECTOR
1279
1280/*
1281 * Private kernel thread to reconstruct mirrors after an unclean
1282 * shutdown.
1283 */
1284static void raid1syncd (void *data)
1285{
1286        raid1_conf_t *conf = data;
1287        mddev_t *mddev = conf->mddev;
1288
1289        if (!conf->resync_mirrors)
1290                return;
1291        if (conf->resync_mirrors == 2)
1292                return;
1293        down(&mddev->recovery_sem);
1294        if (!md_do_sync(mddev, NULL)) {
1295                /*
1296                 * Only if everything went Ok.
1297                 */
1298                conf->resync_mirrors = 0;
1299        }
1300
1301        close_sync(conf);
1302
1303        up(&mddev->recovery_sem);
1304        raid1_shrink_buffers(conf);
1305}
1306
1307/*
1308 * perform a "sync" on one "block"
1309 *
1310 * We need to make sure that no normal I/O request - particularly write
1311 * requests - conflict with active sync requests.
1312 * This is achieved by conceptually dividing the device space into a
1313 * number of sections:
1314 *  DONE: 0 .. a-1     These blocks are in-sync
1315 *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1316 *                     no normal IO requests
1317 *  READY: b .. c-1    These blocks have no normal IO requests - sync
1318 *                     request may be happening
1319 *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1320 *                     ones will be added
1321 *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1322 *                     be happening, but not sync
1323 *
1324 * We keep a
1325 *   phase    which flips (0 or 1) each time d moves and
1326 * a count of:
1327 *   z =  active io requests in FUTURE since d moved - marked with
1328 *        current phase
1329 *   y =  active io requests in FUTURE before d moved, or PENDING -
1330 *        marked with previous phase
1331 *   x =  active sync requests in READY
1332 *   w =  active sync requests in ACTIVE
1333 *   v =  active io requests in DONE
1334 *
1335 * Normally, a=b=c=d=0 and z= active io requests
1336 *   or a=b=c=d=END and v= active io requests
1337 * Allowed changes to a,b,c,d:
1338 * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1339 * B:  y==0 -> c=d
1340 * C:   b=c, w+=x, x=0
1341 * D:  w==0 -> a=b
1342 * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1343 *
1344 * At start of sync we apply A.
 1345 * When y reaches 0, we apply B then A then begin sync requests
 1346 * When sync point reaches c-1, we wait for y==0, and w==0, and
 1347 * then apply B then A then D then C.
1348 * Finally, we apply E
1349 *
1350 * The sync request simply issues a "read" against a working drive
1351 * This is marked so that on completion the raid1d thread is woken to
1352 * issue suitable write requests
1353 */
1354
1355static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1356{
1357        raid1_conf_t *conf = mddev_to_conf(mddev);
1358        struct mirror_info *mirror;
1359        struct raid1_bh *r1_bh;
1360        struct buffer_head *bh;
1361        int bsize;
1362        int disk;
1363        int block_nr;
1364        int buffs;
1365
1366        if (!sector_nr) {
1367                /* we want enough buffers to hold twice the window of 128*/
1368                buffs = 128 *2 / (PAGE_SIZE>>9);
1369                buffs = raid1_grow_buffers(conf, buffs);
1370                if (buffs < 2)
1371                        goto nomem;
1372                conf->window = buffs*(PAGE_SIZE>>9)/2;
1373        }
1374        spin_lock_irq(&conf->segment_lock);
1375        if (!sector_nr) {
1376                /* initialize ...*/
1377                conf->start_active = 0;
1378                conf->start_ready = 0;
1379                conf->start_pending = 0;
1380                conf->start_future = 0;
1381                conf->phase = 0;
1382                
1383                conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1384                conf->cnt_done = conf->cnt_pending = 0;
1385                if (conf->cnt_ready || conf->cnt_active)
1386                        MD_BUG();
1387        }
1388        while (sector_nr >= conf->start_pending) {
1389                PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1390                        sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1391                        conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1392                wait_event_lock_irq(conf->wait_done,
1393                                        !conf->cnt_active,
1394                                        conf->segment_lock);
1395                wait_event_lock_irq(conf->wait_ready,
1396                                        !conf->cnt_pending,
1397                                        conf->segment_lock);
1398                conf->start_active = conf->start_ready;
1399                conf->start_ready = conf->start_pending;
1400                conf->start_pending = conf->start_future;
1401                conf->start_future = conf->start_future+conf->window;
1402                // Note: falling off the end is not a problem
1403                conf->phase = conf->phase ^1;
1404                conf->cnt_active = conf->cnt_ready;
1405                conf->cnt_ready = 0;
1406                conf->cnt_pending = conf->cnt_future;
1407                conf->cnt_future = 0;
1408                wake_up(&conf->wait_done);
1409        }
1410        conf->cnt_ready++;
1411        spin_unlock_irq(&conf->segment_lock);
1412                
1413
1414        /* If reconstructing, and >1 working disc,
1415         * could dedicate one to rebuild and others to
1416         * service read requests ..
1417         */
1418        disk = conf->last_used;
1419        /* make sure disk is operational */
1420        while (!conf->mirrors[disk].operational) {
1421                if (disk <= 0) disk = conf->raid_disks;
1422                disk--;
1423                if (disk == conf->last_used)
1424                        break;
1425        }
1426        conf->last_used = disk;
1427        
1428        mirror = conf->mirrors+conf->last_used;
1429        
1430        r1_bh = raid1_alloc_buf (conf);
1431        r1_bh->master_bh = NULL;
1432        r1_bh->mddev = mddev;
1433        r1_bh->cmd = SPECIAL;
1434        bh = &r1_bh->bh_req;
1435
1436        block_nr = sector_nr;
1437        bsize = 512;
1438        while (!(block_nr & 1) && bsize < PAGE_SIZE
1439                        && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
1440                block_nr >>= 1;
1441                bsize <<= 1;
1442        }
1443        bh->b_size = bsize;
1444        bh->b_list = BUF_LOCKED;
1445        bh->b_dev = mirror->dev;
1446        bh->b_rdev = mirror->dev;
1447        bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1448        if (!bh->b_page)
1449                BUG();
1450        if (!bh->b_data)
1451                BUG();
1452        if (bh->b_data != page_address(bh->b_page))
1453                BUG();
1454        bh->b_end_io = end_sync_read;
1455        bh->b_private = r1_bh;
1456        bh->b_blocknr = sector_nr;
1457        bh->b_rsector = sector_nr;
1458        init_waitqueue_head(&bh->b_wait);
1459
1460        generic_make_request(READ, bh);
1461        md_sync_acct(bh->b_dev, bh->b_size/512);
1462
1463        return (bsize >> 9);
1464
1465nomem:
1466        raid1_shrink_buffers(conf);
1467        return -ENOMEM;
1468}
1469
1470static void end_sync_read(struct buffer_head *bh, int uptodate)
1471{
1472        struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1473
1474        /* we have read a block, now it needs to be re-written,
1475         * or re-read if the read failed.
1476         * We don't do much here, just schedule handling by raid1d
1477         */
1478        if (!uptodate)
1479                md_error (r1_bh->mddev, bh->b_dev);
1480        else
1481                set_bit(R1BH_Uptodate, &r1_bh->state);
1482        raid1_reschedule_retry(r1_bh);
1483}
1484
1485static void end_sync_write(struct buffer_head *bh, int uptodate)
1486{
1487        struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1488        
1489        if (!uptodate)
1490                md_error (r1_bh->mddev, bh->b_dev);
1491        if (atomic_dec_and_test(&r1_bh->remaining)) {
1492                mddev_t *mddev = r1_bh->mddev;
1493                unsigned long sect = bh->b_blocknr;
1494                int size = bh->b_size;
1495                raid1_free_buf(r1_bh);
1496                sync_request_done(sect, mddev_to_conf(mddev));
1497                md_done_sync(mddev,size>>9, uptodate);
1498        }
1499}
1500
1501#define INVALID_LEVEL KERN_WARNING \
1502"raid1: md%d: raid level not set to mirroring (%d)\n"
1503
1504#define NO_SB KERN_ERR \
1505"raid1: disabled mirror %s (couldn't access raid superblock)\n"
1506
1507#define ERRORS KERN_ERR \
1508"raid1: disabled mirror %s (errors detected)\n"
1509
1510#define NOT_IN_SYNC KERN_ERR \
1511"raid1: disabled mirror %s (not in sync)\n"
1512
1513#define INCONSISTENT KERN_ERR \
1514"raid1: disabled mirror %s (inconsistent descriptor)\n"
1515
1516#define ALREADY_RUNNING KERN_ERR \
1517"raid1: disabled mirror %s (mirror %d already operational)\n"
1518
1519#define OPERATIONAL KERN_INFO \
1520"raid1: device %s operational as mirror %d\n"
1521
1522#define MEM_ERROR KERN_ERR \
1523"raid1: couldn't allocate memory for md%d\n"
1524
1525#define SPARE KERN_INFO \
1526"raid1: spare disk %s\n"
1527
1528#define NONE_OPERATIONAL KERN_ERR \
1529"raid1: no operational mirrors for md%d\n"
1530
1531#define ARRAY_IS_ACTIVE KERN_INFO \
1532"raid1: raid set md%d active with %d out of %d mirrors\n"
1533
1534#define THREAD_ERROR KERN_ERR \
1535"raid1: couldn't allocate thread for md%d\n"
1536
1537#define START_RESYNC KERN_WARNING \
1538"raid1: raid set md%d not clean; reconstructing mirrors\n"
1539
1540static int raid1_run (mddev_t *mddev)
1541{
1542        raid1_conf_t *conf;
1543        int i, j, disk_idx;
1544        struct mirror_info *disk;
1545        mdp_super_t *sb = mddev->sb;
1546        mdp_disk_t *descriptor;
1547        mdk_rdev_t *rdev;
1548        struct md_list_head *tmp;
1549        int start_recovery = 0;
1550
1551        MOD_INC_USE_COUNT;
1552
1553        if (sb->level != 1) {
1554                printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1555                goto out;
1556        }
1557        /*
1558         * copy the already verified devices into our private RAID1
1559         * bookkeeping area. [whatever we allocate in raid1_run(),
1560         * should be freed in raid1_stop()]
1561         */
1562
1563        conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1564        mddev->private = conf;
1565        if (!conf) {
1566                printk(MEM_ERROR, mdidx(mddev));
1567                goto out;
1568        }
1569        memset(conf, 0, sizeof(*conf));
1570
1571        ITERATE_RDEV(mddev,rdev,tmp) {
1572                if (rdev->faulty) {
1573                        printk(ERRORS, partition_name(rdev->dev));
1574                } else {
1575                        if (!rdev->sb) {
1576                                MD_BUG();
1577                                continue;
1578                        }
1579                }
1580                if (rdev->desc_nr == -1) {
1581                        MD_BUG();
1582                        continue;
1583                }
1584                descriptor = &sb->disks[rdev->desc_nr];
1585                disk_idx = descriptor->raid_disk;
1586                disk = conf->mirrors + disk_idx;
1587
1588                if (disk_faulty(descriptor)) {
1589                        disk->number = descriptor->number;
1590                        disk->raid_disk = disk_idx;
1591                        disk->dev = rdev->dev;
1592                        disk->sect_limit = MAX_WORK_PER_DISK;
1593                        disk->operational = 0;
1594                        disk->write_only = 0;
1595                        disk->spare = 0;
1596                        disk->used_slot = 1;
1597                        disk->head_position = 0;
1598                        continue;
1599                }
1600                if (disk_active(descriptor)) {
1601                        if (!disk_sync(descriptor)) {
1602                                printk(NOT_IN_SYNC,
1603                                        partition_name(rdev->dev));
1604                                continue;
1605                        }
1606                        if ((descriptor->number > MD_SB_DISKS) ||
1607                                         (disk_idx > sb->raid_disks)) {
1608
1609                                printk(INCONSISTENT,
1610                                        partition_name(rdev->dev));
1611                                continue;
1612                        }
1613                        if (disk->operational) {
1614                                printk(ALREADY_RUNNING,
1615                                        partition_name(rdev->dev),
1616                                        disk_idx);
1617                                continue;
1618                        }
1619                        printk(OPERATIONAL, partition_name(rdev->dev),
1620                                        disk_idx);
1621                        disk->number = descriptor->number;
1622                        disk->raid_disk = disk_idx;
1623                        disk->dev = rdev->dev;
1624                        disk->sect_limit = MAX_WORK_PER_DISK;
1625                        disk->operational = 1;
1626                        disk->write_only = 0;
1627                        disk->spare = 0;
1628                        disk->used_slot = 1;
1629                        disk->head_position = 0;
1630                        conf->working_disks++;
1631                } else {
1632                /*
1633                 * Must be a spare disk ..
1634                 */
1635                        printk(SPARE, partition_name(rdev->dev));
1636                        disk->number = descriptor->number;
1637                        disk->raid_disk = disk_idx;
1638                        disk->dev = rdev->dev;
1639                        disk->sect_limit = MAX_WORK_PER_DISK;
1640                        disk->operational = 0;
1641                        disk->write_only = 0;
1642                        disk->spare = 1;
1643                        disk->used_slot = 1;
1644                        disk->head_position = 0;
1645                }
1646        }
1647        conf->raid_disks = sb->raid_disks;
1648        conf->nr_disks = sb->nr_disks;
1649        conf->mddev = mddev;
1650        conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1651
1652        conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1653        init_waitqueue_head(&conf->wait_buffer);
1654        init_waitqueue_head(&conf->wait_done);
1655        init_waitqueue_head(&conf->wait_ready);
1656
1657        if (!conf->working_disks) {
1658                printk(NONE_OPERATIONAL, mdidx(mddev));
1659                goto out_free_conf;
1660        }
1661
1662
1663        /* pre-allocate some buffer_head structures.
1664         * As a minimum, 1 r1bh and raid_disks buffer_heads
1665         * would probably get us by in tight memory situations,
1666         * but a few more is probably a good idea.
1667         * For now, try NR_RESERVED_BUFS r1bh and
1668         * NR_RESERVED_BUFS*raid_disks bufferheads
1669         * This will allow at least NR_RESERVED_BUFS concurrent
1670         * reads or writes even if kmalloc starts failing
1671         */
1672        if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1673            raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1674                              < NR_RESERVED_BUFS*conf->raid_disks) {
1675                printk(MEM_ERROR, mdidx(mddev));
1676                goto out_free_conf;
1677        }
1678
1679        for (i = 0; i < MD_SB_DISKS; i++) {
1680                
1681                descriptor = sb->disks+i;
1682                disk_idx = descriptor->raid_disk;
1683                disk = conf->mirrors + disk_idx;
1684
1685                if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1686                                !disk->used_slot) {
1687
1688                        disk->number = descriptor->number;
1689                        disk->raid_disk = disk_idx;
1690                        disk->dev = MKDEV(0,0);
1691
1692                        disk->operational = 0;
1693                        disk->write_only = 0;
1694                        disk->spare = 0;
1695                        disk->used_slot = 1;
1696                        disk->head_position = 0;
1697                }
1698        }
1699
1700        /*
1701         * find the first working one and use it as a starting point
1702         * to read balancing.
1703         */
1704        for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
1705                /* nothing */;
1706        conf->last_used = j;
1707
1708
1709        if (conf->working_disks != sb->raid_disks) {
1710                printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1711                start_recovery = 1;
1712        }
1713
1714        {
1715                const char * name = "raid1d";
1716
1717                conf->thread = md_register_thread(raid1d, conf, name);
1718                if (!conf->thread) {
1719                        printk(THREAD_ERROR, mdidx(mddev));
1720                        goto out_free_conf;
1721                }
1722        }
1723
1724        if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
1725            (conf->working_disks > 1)) {
1726                const char * name = "raid1syncd";
1727
1728                conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1729                if (!conf->resync_thread) {
1730                        printk(THREAD_ERROR, mdidx(mddev));
1731                        goto out_free_conf;
1732                }
1733
1734                printk(START_RESYNC, mdidx(mddev));
1735                conf->resync_mirrors = 1;
1736                md_wakeup_thread(conf->resync_thread);
1737        }
1738
1739        /*
1740         * Regenerate the "device is in sync with the raid set" bit for
1741         * each device.
1742         */
1743        for (i = 0; i < MD_SB_DISKS; i++) {
1744                mark_disk_nonsync(sb->disks+i);
1745                for (j = 0; j < sb->raid_disks; j++) {
1746                        if (!conf->mirrors[j].operational)
1747                                continue;
1748                        if (sb->disks[i].number == conf->mirrors[j].number)
1749                                mark_disk_sync(sb->disks+i);
1750                }
1751        }
1752        sb->active_disks = conf->working_disks;
1753
1754        if (start_recovery)
1755                md_recover_arrays();
1756
1757
1758        printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1759        /*
1760         * Ok, everything is just fine now
1761         */
1762        return 0;
1763
1764out_free_conf:
1765        raid1_shrink_r1bh(conf);
1766        raid1_shrink_bh(conf);
1767        raid1_shrink_buffers(conf);
1768        kfree(conf);
1769        mddev->private = NULL;
1770out:
1771        MOD_DEC_USE_COUNT;
1772        return -EIO;
1773}
1774
/*
 * Drop the printk message macros once raid1_run() is done with them so
 * they do not stay defined for the rest of the file.
 *
 * MEM_ERROR, THREAD_ERROR and START_RESYNC are used by raid1_run() too,
 * so they are dropped here along with the others (#undef of a name that
 * is not currently a macro is harmless).
 */
#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef ARRAY_IS_ACTIVE
#undef MEM_ERROR
#undef THREAD_ERROR
#undef START_RESYNC
1785
1786static int raid1_stop_resync (mddev_t *mddev)
1787{
1788        raid1_conf_t *conf = mddev_to_conf(mddev);
1789
1790        if (conf->resync_thread) {
1791                if (conf->resync_mirrors) {
1792                        conf->resync_mirrors = 2;
1793                        md_interrupt_thread(conf->resync_thread);
1794
1795                        printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1796                        return 1;
1797                }
1798                return 0;
1799        }
1800        return 0;
1801}
1802
1803static int raid1_restart_resync (mddev_t *mddev)
1804{
1805        raid1_conf_t *conf = mddev_to_conf(mddev);
1806
1807        if (conf->resync_mirrors) {
1808                if (!conf->resync_thread) {
1809                        MD_BUG();
1810                        return 0;
1811                }
1812                conf->resync_mirrors = 1;
1813                md_wakeup_thread(conf->resync_thread);
1814                return 1;
1815        }
1816        return 0;
1817}
1818
1819static int raid1_stop (mddev_t *mddev)
1820{
1821        raid1_conf_t *conf = mddev_to_conf(mddev);
1822
1823        md_unregister_thread(conf->thread);
1824        if (conf->resync_thread)
1825                md_unregister_thread(conf->resync_thread);
1826        raid1_shrink_r1bh(conf);
1827        raid1_shrink_bh(conf);
1828        raid1_shrink_buffers(conf);
1829        kfree(conf);
1830        mddev->private = NULL;
1831        MOD_DEC_USE_COUNT;
1832        return 0;
1833}
1834
1835static mdk_personality_t raid1_personality=
1836{
1837        name:           "raid1",
1838        make_request:   raid1_make_request,
1839        run:            raid1_run,
1840        stop:           raid1_stop,
1841        status:         raid1_status,
1842        error_handler:  raid1_error,
1843        diskop:         raid1_diskop,
1844        stop_resync:    raid1_stop_resync,
1845        restart_resync: raid1_restart_resync,
1846        sync_request:   raid1_sync_request
1847};
1848
1849static int md__init raid1_init (void)
1850{
1851        return register_md_personality (RAID1, &raid1_personality);
1852}
1853
1854static void raid1_exit (void)
1855{
1856        unregister_md_personality (RAID1);
1857}
1858
1859module_init(raid1_init);
1860module_exit(raid1_exit);
1861MODULE_LICENSE("GPL");
1862
/* lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995. */