@@ -730,74 +730,71 @@ static bool should_choose_next(struct r1conf *conf, int disk)
 		mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
 }
 
-/*
- * This routine returns the disk from which the requested read should
- * be done. There is a per-array 'next expected sequential IO' sector
- * number - if this matches on the next IO then we use the last disk.
- * There is also a per-disk 'last known head position' sector that is
- * maintained from IRQ contexts, both the normal and the resync IO
- * completion handlers update this position correctly. If there is no
- * perfect sequential match then we pick the disk whose head is closest.
- *
- * If there are 2 mirrors in the same 2 devices, performance degrades
- * because position is mirror, not device based.
- *
- * The rdev for the device selected will have nr_pending incremented.
- */
-static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
+static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
 {
-	const sector_t this_sector = r1_bio->sector;
-	int sectors;
-	int best_good_sectors;
-	int best_disk, best_dist_disk, best_pending_disk, sequential_disk;
-	int disk;
-	sector_t best_dist;
-	unsigned int min_pending;
-	struct md_rdev *rdev;
+	if (!rdev || test_bit(Faulty, &rdev->flags))
+		return false;
 
- retry:
-	sectors = r1_bio->sectors;
-	best_disk = -1;
-	best_dist_disk = -1;
-	sequential_disk = -1;
-	best_dist = MaxSector;
-	best_pending_disk = -1;
-	min_pending = UINT_MAX;
-	best_good_sectors = 0;
-	clear_bit(R1BIO_FailFast, &r1_bio->state);
+	/* still in recovery */
+	if (!test_bit(In_sync, &rdev->flags) &&
+	    rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
+		return false;
 
-	if (raid1_should_read_first(conf->mddev, this_sector, sectors))
-		return choose_first_rdev(conf, r1_bio, max_sectors);
+	/* don't read from slow disk unless have to */
+	if (test_bit(WriteMostly, &rdev->flags))
+		return false;
+
+	/* don't split IO for bad blocks unless have to */
+	if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
+		return false;
+
+	return true;
+}
+
+struct read_balance_ctl {
+	sector_t closest_dist;
+	int closest_dist_disk;
+	int min_pending;
+	int min_pending_disk;
+	int sequential_disk;
+	int readable_disks;
+};
+
+static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
+{
+	int disk;
+	struct read_balance_ctl ctl = {
+		.closest_dist_disk = -1,
+		.closest_dist = MaxSector,
+		.min_pending_disk = -1,
+		.min_pending = UINT_MAX,
+		.sequential_disk = -1,
+	};
 
 	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+		struct md_rdev *rdev;
 		sector_t dist;
 		unsigned int pending;
 
-		rdev = conf->mirrors[disk].rdev;
-		if (r1_bio->bios[disk] == IO_BLOCKED
-		    || rdev == NULL
-		    || test_bit(Faulty, &rdev->flags))
-			continue;
-		if (!test_bit(In_sync, &rdev->flags) &&
-		    rdev->recovery_offset < this_sector + sectors)
-			continue;
-		if (test_bit(WriteMostly, &rdev->flags))
+		if (r1_bio->bios[disk] == IO_BLOCKED)
 			continue;
-		if (rdev_has_badblock(rdev, this_sector, sectors))
+
+		rdev = conf->mirrors[disk].rdev;
+		if (!rdev_readable(rdev, r1_bio))
 			continue;
 
-		if (best_disk >= 0)
-			/* At least two disks to choose from so failfast is OK */
+		/* At least two disks to choose from so failfast is OK */
+		if (ctl.readable_disks++ == 1)
 			set_bit(R1BIO_FailFast, &r1_bio->state);
 
 		pending = atomic_read(&rdev->nr_pending);
-		dist = abs(this_sector - conf->mirrors[disk].head_position);
+		dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
+
 		/* Don't change to another disk for sequential reads */
 		if (is_sequential(conf, disk, r1_bio)) {
-			if (!should_choose_next(conf, disk)) {
-				best_disk = disk;
-				break;
-			}
+			if (!should_choose_next(conf, disk))
+				return disk;
+
 			/*
 			 * Add 'pending' to avoid choosing this disk if
 			 * there is another idle disk.
@@ -807,52 +804,76 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
 			 * If there is no other idle disk, this disk
 			 * will be chosen.
 			 */
-			sequential_disk = disk;
+			ctl.sequential_disk = disk;
 		}
 
-		if (min_pending > pending) {
-			min_pending = pending;
-			best_pending_disk = disk;
+		if (ctl.min_pending > pending) {
+			ctl.min_pending = pending;
+			ctl.min_pending_disk = disk;
 		}
 
-		if (dist < best_dist) {
-			best_dist = dist;
-			best_dist_disk = disk;
+		if (ctl.closest_dist > dist) {
+			ctl.closest_dist = dist;
+			ctl.closest_dist_disk = disk;
 		}
 	}
 
 	/*
 	 * Sequential IO size exceeds the optimal iosize; however, there is no
 	 * other idle disk, so choose the sequential disk.
 	 */
-	if (best_disk == -1 && min_pending != 0)
-		best_disk = sequential_disk;
+	if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
+		return ctl.sequential_disk;
 
 	/*
 	 * If all disks are rotational, choose the closest disk. If any disk is
 	 * non-rotational, choose the disk with fewer pending requests even if
 	 * that disk is rotational, which might or might not be optimal for raids
 	 * with mixed rotational/non-rotational disks depending on the workload.
 	 */
-	if (best_disk == -1) {
-		if (READ_ONCE(conf->nonrot_disks) || min_pending == 0)
-			best_disk = best_pending_disk;
-		else
-			best_disk = best_dist_disk;
-	}
+	if (ctl.min_pending_disk != -1 &&
+	    (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
+		return ctl.min_pending_disk;
+	else
+		return ctl.closest_dist_disk;
+}
 
-	if (best_disk >= 0) {
-		rdev = conf->mirrors[best_disk].rdev;
-		if (!rdev)
-			goto retry;
+/*
+ * This routine returns the disk from which the requested read should be done.
+ *
+ * 1) If resync is in progress, find the first usable disk and use it even if it
+ *    has some bad blocks.
+ *
+ * 2) Now that there is no resync, loop over all disks, skipping slow disks and
+ *    disks with bad blocks for now. Only pay attention to the key disk choice.
+ *
+ * 3) If we've made it this far, now look for disks with bad blocks and choose
+ *    the one with the most good sectors.
+ *
+ * 4) If we are all the way at the end, we have no choice but to use a disk even
+ *    if it is write mostly.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
+			int *max_sectors)
+{
+	int disk;
 
-		sectors = best_good_sectors;
-		update_read_sectors(conf, disk, this_sector, sectors);
-	}
-	*max_sectors = sectors;
+	clear_bit(R1BIO_FailFast, &r1_bio->state);
+
+	if (raid1_should_read_first(conf->mddev, r1_bio->sector,
+				    r1_bio->sectors))
+		return choose_first_rdev(conf, r1_bio, max_sectors);
 
-	if (best_disk >= 0)
-		return best_disk;
+	disk = choose_best_rdev(conf, r1_bio);
+	if (disk >= 0) {
+		*max_sectors = r1_bio->sectors;
+		update_read_sectors(conf, disk, r1_bio->sector,
+				    r1_bio->sectors);
+		return disk;
+	}
 
 	/*
 	 * If we are here it means we didn't find a perfectly good disk so
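The refactor above splits the old read_balance() into rdev_readable(), choose_best_rdev(), and a thin read_balance() wrapper. As a reading aid only, the following stand-alone C sketch models the selection priority that choose_best_rdev() applies: a leg already serving this sequential stream wins outright, then the least-loaded leg when the array has a non-rotational member or some leg is idle, then the leg with the closest head position. The names disk_state and pick_read_disk() are invented for illustration, and the IO_BLOCKED, failfast, bad-block and should_choose_next()/pending++ handling of the real function is deliberately omitted; this is not the kernel code.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for one mirror leg; not a kernel structure. */
struct disk_state {
	bool readable;		/* what rdev_readable() decides in the patch */
	bool sequential;	/* is_sequential() && !should_choose_next() */
	unsigned int pending;	/* in-flight requests, like rdev->nr_pending */
	long long distance;	/* |read sector - last known head position| */
};

/*
 * Toy model of choose_best_rdev(): remember the least-loaded and the
 * closest readable leg, and pick, in order:
 *   1) a leg that is already streaming this sequential read,
 *   2) the least-pending leg when any disk is non-rotational or some
 *      leg is completely idle,
 *   3) otherwise the leg whose head is closest.
 * Returns -1 when no leg is readable.
 */
static int pick_read_disk(const struct disk_state *d, int ndisks,
			  bool any_nonrot)
{
	long long closest_dist = LLONG_MAX;
	unsigned int min_pending = UINT_MAX;
	int closest_disk = -1, min_pending_disk = -1;

	for (int i = 0; i < ndisks; i++) {
		if (!d[i].readable)
			continue;

		if (d[i].sequential)
			return i;	/* keep sequential IO on one leg */

		if (d[i].pending < min_pending) {
			min_pending = d[i].pending;
			min_pending_disk = i;
		}
		if (d[i].distance < closest_dist) {
			closest_dist = d[i].distance;
			closest_disk = i;
		}
	}

	if (min_pending_disk != -1 && (any_nonrot || min_pending == 0))
		return min_pending_disk;
	return closest_disk;
}

int main(void)
{
	struct disk_state disks[] = {
		{ .readable = true, .pending = 3, .distance = 10 },
		{ .readable = true, .pending = 0, .distance = 500 },
		{ .readable = false },
	};

	/* Rotational array, but disk 1 is idle, so it is chosen. */
	printf("chosen disk: %d\n", pick_read_disk(disks, 3, false));
	return 0;
}

Built with any C99 compiler, the sketch prints "chosen disk: 1": disk 1 is idle, and an idle disk wins over head distance even on a purely rotational array, mirroring the ctl.min_pending == 0 test in the patch.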