diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 21692083bd32..424b8dca4e6a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1892,9 +1892,10 @@ public void visitRegionState(Result result, final RegionInfo regionInfo, final S } // add regions to RIT while visiting the meta regionInTransitionTracker.handleRegionStateNodeOperation(regionNode); - // If region location of region belongs to a dead server mark the region crashed + // If region is supposed to be serve traffic (NOT split and merged) and location of region + // belongs to a dead server mark the region crashed if ( - regionNode.getRegionLocation() != null + regionNode.getRegionLocation() != null && !AssignmentManagerUtil.isSplitOrMerged(regionNode) && master.getServerManager().isServerDead(regionNode.getRegionLocation()) ) { long timeOfCrash = master.getServerManager().getDeadServers() @@ -2116,10 +2117,6 @@ public List getRegionsInTransition() { return regionInTransitionTracker.getRegionsInTransition(); } - public boolean isRegionInTransition(final RegionInfo regionInfo) { - return regionInTransitionTracker.isRegionInTransition(regionInfo); - } - public int getRegionTransitScheduledCount() { return regionStates.getRegionTransitScheduledCount(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManagerUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManagerUtil.java index 3f8b50e0913b..8227856c85bf 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManagerUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManagerUtil.java @@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.favored.FavoredNodesManager; +import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.util.FutureUtils; import org.apache.hadoop.hbase.wal.WALSplitUtil; @@ -298,4 +299,16 @@ static void checkClosedRegion(MasterProcedureEnv env, RegionInfo regionInfo) thr + ", abort split/merge to prevent data loss"); } } + + /** + * For splitting, need to test both region info and state, and will return true if either of the + * test returns true. Please see the comments in + * {@link AssignmentManager#markRegionAsSplit(RegionInfo, ServerName, RegionInfo, RegionInfo)} for + * more details on why we need to test two conditions. + */ + static boolean isSplitOrMerged(RegionStateNode regionStateNode) { + return regionStateNode.getState() == RegionState.State.SPLIT + || regionStateNode.getRegionInfo().isSplit() + || regionStateNode.getState() == RegionState.State.MERGED; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionInTransitionTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionInTransitionTracker.java index b5e0f03e4d75..de79cde70bb1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionInTransitionTracker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionInTransitionTracker.java @@ -41,22 +41,22 @@ public class RegionInTransitionTracker { private final List ENABLE_TABLE_REGION_STATE = List.of(RegionState.State.OPEN); + // DO NOT USE containsKey()/remove() on regionInTransition with a different RegionInfo instance: + // this map is ordered by RegionInfo.COMPARATOR, and that comparator includes the offline flag. + // Lookups can therefore fail if the RegionInfo used as the key has a different offline value, + // even when it refers to the same region. Offline value changes with splitting. private final ConcurrentSkipListMap regionInTransition = new ConcurrentSkipListMap<>(RegionInfo.COMPARATOR); private TableStateManager tableStateManager; - public boolean isRegionInTransition(final RegionInfo regionInfo) { - return regionInTransition.containsKey(regionInfo); - } - /** * Handles a region whose hosting RegionServer has crashed. When a RegionServer fails, all regions * it was hosting are automatically added to the RIT list since they need to be reassigned to * other servers. */ public void regionCrashed(RegionStateNode regionStateNode) { - if (regionStateNode.getRegionInfo().getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) { + if (isReplica(regionStateNode)) { return; } @@ -76,7 +76,7 @@ public void regionCrashed(RegionStateNode regionStateNode) { */ public void handleRegionStateNodeOperation(RegionStateNode regionStateNode) { // only consider default replica for availability - if (regionStateNode.getRegionInfo().getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) { + if (isReplica(regionStateNode)) { return; } @@ -86,10 +86,7 @@ public void handleRegionStateNodeOperation(RegionStateNode regionStateNode) { tableEnabled ? ENABLE_TABLE_REGION_STATE : DISABLE_TABLE_REGION_STATE; // if region is merged or split it should not be in RIT list - if ( - currentState == RegionState.State.SPLIT || currentState == RegionState.State.MERGED - || regionStateNode.getRegionInfo().isSplit() - ) { + if (AssignmentManagerUtil.isSplitOrMerged(regionStateNode)) { if (removeRegionInTransition(regionStateNode.getRegionInfo())) { LOG.debug("Removed {} from RIT list as it is split or merged", regionStateNode.getRegionInfo().getEncodedName()); @@ -153,4 +150,9 @@ public List getRegionsInTransition() { public void setTableStateManager(TableStateManager tableStateManager) { this.tableStateManager = tableStateManager; } + + private static boolean isReplica(RegionStateNode regionStateNode) { + return regionStateNode.getRegionInfo().getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID; + } + } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/AssignmentTestingUtil.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/AssignmentTestingUtil.java index b63f02d2ff7a..9b8ba58f9e86 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/AssignmentTestingUtil.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/AssignmentTestingUtil.java @@ -49,12 +49,17 @@ private AssignmentTestingUtil() { } public static void waitForRegionToBeInTransition(final HBaseTestingUtil util, - final RegionInfo hri) throws Exception { - while (!getMaster(util).getAssignmentManager().isRegionInTransition(hri)) { + final RegionInfo hri) { + while (!isRegionInTransition(hri, getMaster(util).getAssignmentManager())) { Threads.sleep(10); } } + public static boolean isRegionInTransition(RegionInfo hri, AssignmentManager am) { + return am.getRegionsInTransition().stream() + .anyMatch(rsn -> rsn.getRegionInfo().getEncodedName().equals(hri.getEncodedName())); + } + public static void waitForRsToBeDead(final HBaseTestingUtil util, final ServerName serverName) throws Exception { util.waitFor(60000, new ExplainingPredicate() { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionSplit.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionSplit.java index 13927082d175..43282e430626 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionSplit.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionSplit.java @@ -19,6 +19,7 @@ import static org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil.insertData; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -117,8 +118,8 @@ public void testSplitTableRegion() throws Exception { int splitRowNum = startRowNum + rowCount / 2; byte[] splitKey = Bytes.toBytes("" + splitRowNum); - assertTrue(regions != null, "not able to find a splittable region"); - assertTrue(regions.length == 1, "not able to find a splittable region"); + assertNotNull(regions, "not able to find a splittable region"); + assertEquals(1, regions.length, "not able to find a splittable region"); // Split region of the table long procId = procExec.submitProcedure( @@ -127,7 +128,7 @@ public void testSplitTableRegion() throws Exception { ProcedureTestingUtility.waitProcedure(procExec, procId); ProcedureTestingUtility.assertProcNotFailed(procExec, procId); - assertTrue(UTIL.getHBaseCluster().getRegions(tableName).size() == 2, "not able to split table"); + assertEquals(2, UTIL.getHBaseCluster().getRegions(tableName).size(), "not able to split table"); // disable table UTIL.getAdmin().disableTable(tableName); @@ -155,6 +156,57 @@ public void testSplitTableRegion() throws Exception { regionInfoMap.get(tableRegions.get(1).getRegionInfo())); } + @Test + public void testRITWithSplitTableRegion() throws Exception { + final TableName tableName = TableName.valueOf(testMethodName); + final ProcedureExecutor procExec = getMasterProcedureExecutor(); + + RegionInfo[] regions = + MasterProcedureTestingUtility.createTable(procExec, tableName, null, columnFamilyName); + insertData(UTIL, tableName, rowCount, startRowNum, columnFamilyName); + int splitRowNum = startRowNum + rowCount / 2; + byte[] splitKey = Bytes.toBytes("" + splitRowNum); + + assertNotNull(regions, "not able to find a splittable region"); + assertEquals(1, regions.length, "not able to find a splittable region"); + assertFalse(AssignmentTestingUtil.isRegionInTransition(regions[0], + UTIL.getHBaseCluster().getMaster().getAssignmentManager())); + + ServerName targetRS = UTIL.getHBaseCluster().getMaster().getAssignmentManager() + .getRegionStates().getRegionServerOfRegion(regions[0]); + // Split region of the table + long procId = procExec.submitProcedure( + new SplitTableRegionProcedure(procExec.getEnvironment(), regions[0], splitKey)); + // Wait the completion + ProcedureTestingUtility.waitProcedure(procExec, procId); + ProcedureTestingUtility.assertProcNotFailed(procExec, procId); + + assertEquals(2, UTIL.getHBaseCluster().getRegions(tableName).size(), "not able to split table"); + assertFalse(AssignmentTestingUtil.isRegionInTransition(regions[0], + UTIL.getHBaseCluster().getMaster().getAssignmentManager())); + // As there are only 3 RS, start one more RS before expiring one + UTIL.getHBaseCluster().startRegionServer(); + + // We don't want SCP to complete so kill PR it after store update + ProcedureTestingUtility + .toggleKillAfterStoreUpdate(UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor()); + // stop RS holding split parent to create SCP and add RS into deadServerList + UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(targetRS); + + // stop master + UTIL.getHBaseCluster().stopMaster(0); + UTIL.getHBaseCluster().waitOnMaster(0); + + // restart master + UTIL.getHBaseCluster().startMaster(); + assertTrue(UTIL.getHBaseCluster().waitForActiveAndReadyMaster(30000), + "Master failed to initialize in in 30 seconds"); + UTIL.invalidateConnection(); + + assertFalse(AssignmentTestingUtil.isRegionInTransition(regions[0], + UTIL.getHBaseCluster().getMaster().getAssignmentManager())); + } + @Test public void testSplitStoreFiles() throws Exception { final TableName tableName = TableName.valueOf(testMethodName); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java index 6aa195123062..e98ad9189b00 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java @@ -907,7 +907,8 @@ public void testSplitRegionWithNoStoreFiles() throws Exception { } catch (DoNotRetryIOException e) { // Expected } - assertFalse(am.isRegionInTransition(hri), "Split region can't be assigned"); + assertFalse(AssignmentTestingUtil.isRegionInTransition(hri, am), + "Split region can't be assigned"); assertTrue(regionStates.isRegionInState(hri, State.SPLIT)); // We should not be able to unassign it either @@ -917,7 +918,8 @@ public void testSplitRegionWithNoStoreFiles() throws Exception { } catch (DoNotRetryIOException e) { // Expected } - assertFalse(am.isRegionInTransition(hri), "Split region can't be unassigned"); + assertFalse(AssignmentTestingUtil.isRegionInTransition(hri, am), + "Split region can't be unassigned"); assertTrue(regionStates.isRegionInState(hri, State.SPLIT)); } finally { admin.balancerSwitch(true, false);