Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1892,9 +1892,10 @@ public void visitRegionState(Result result, final RegionInfo regionInfo, final S
}
// add regions to RIT while visiting the meta
regionInTransitionTracker.handleRegionStateNodeOperation(regionNode);
// If region location of region belongs to a dead server mark the region crashed
// If region is supposed to be serve traffic (NOT split and merged) and location of region
// belongs to a dead server mark the region crashed
if (
regionNode.getRegionLocation() != null
regionNode.getRegionLocation() != null && !AssignmentManagerUtil.isSplitOrMerged(regionNode)
&& master.getServerManager().isServerDead(regionNode.getRegionLocation())
) {
long timeOfCrash = master.getServerManager().getDeadServers()
Expand Down Expand Up @@ -2116,10 +2117,6 @@ public List<RegionStateNode> getRegionsInTransition() {
return regionInTransitionTracker.getRegionsInTransition();
}

public boolean isRegionInTransition(final RegionInfo regionInfo) {
return regionInTransitionTracker.isRegionInTransition(regionInfo);
}

public int getRegionTransitScheduledCount() {
return regionStates.getRegionTransitScheduledCount();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.util.FutureUtils;
import org.apache.hadoop.hbase.wal.WALSplitUtil;
Expand Down Expand Up @@ -298,4 +299,16 @@ static void checkClosedRegion(MasterProcedureEnv env, RegionInfo regionInfo) thr
+ ", abort split/merge to prevent data loss");
}
}

/**
* For splitting, need to test both region info and state, and will return true if either of the
* test returns true. Please see the comments in
* {@link AssignmentManager#markRegionAsSplit(RegionInfo, ServerName, RegionInfo, RegionInfo)} for
* more details on why we need to test two conditions.
*/
static boolean isSplitOrMerged(RegionStateNode regionStateNode) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An AI assisted review finds:

This check isSplitOrMerged has been moved out of regionCrashed , where it formerly guarded calls to both visitRegionState and markRegionsAsCrashed. After this refactor, now only visitRegionState is guarded. A split parent in SPLIT state whose location is still equal to the crashed server (the parent stays in regionStates in this state until the catalog janitor cleans it) will now be processed in regionCrashed when this didn't used to be possible.

Recommendation:

Add & !AssignmentManagerUtil.isSplitOrMerged(node) to the if (crashedServerName.equals(node.getRegionLocation())) branch in markRegionsAsCrashed, just like the same check guard visitRegionState.

return regionStateNode.getState() == RegionState.State.SPLIT
|| regionStateNode.getRegionInfo().isSplit()
|| regionStateNode.getState() == RegionState.State.MERGED;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,22 @@ public class RegionInTransitionTracker {

private final List<RegionState.State> ENABLE_TABLE_REGION_STATE = List.of(RegionState.State.OPEN);

// DO NOT USE containsKey()/remove() on regionInTransition with a different RegionInfo instance:
// this map is ordered by RegionInfo.COMPARATOR, and that comparator includes the offline flag.
// Lookups can therefore fail if the RegionInfo used as the key has a different offline value,
// even when it refers to the same region. Offline value changes with splitting.
private final ConcurrentSkipListMap<RegionInfo, RegionStateNode> regionInTransition =
new ConcurrentSkipListMap<>(RegionInfo.COMPARATOR);

private TableStateManager tableStateManager;

public boolean isRegionInTransition(final RegionInfo regionInfo) {
return regionInTransition.containsKey(regionInfo);
}

/**
* Handles a region whose hosting RegionServer has crashed. When a RegionServer fails, all regions
* it was hosting are automatically added to the RIT list since they need to be reassigned to
* other servers.
*/
public void regionCrashed(RegionStateNode regionStateNode) {
if (regionStateNode.getRegionInfo().getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) {
if (isReplica(regionStateNode)) {
return;
}

Expand All @@ -76,7 +76,7 @@ public void regionCrashed(RegionStateNode regionStateNode) {
*/
public void handleRegionStateNodeOperation(RegionStateNode regionStateNode) {
// only consider default replica for availability
if (regionStateNode.getRegionInfo().getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) {
if (isReplica(regionStateNode)) {
return;
}

Expand All @@ -86,10 +86,7 @@ public void handleRegionStateNodeOperation(RegionStateNode regionStateNode) {
tableEnabled ? ENABLE_TABLE_REGION_STATE : DISABLE_TABLE_REGION_STATE;

// if region is merged or split it should not be in RIT list
if (
currentState == RegionState.State.SPLIT || currentState == RegionState.State.MERGED
|| regionStateNode.getRegionInfo().isSplit()
) {
if (AssignmentManagerUtil.isSplitOrMerged(regionStateNode)) {
if (removeRegionInTransition(regionStateNode.getRegionInfo())) {
LOG.debug("Removed {} from RIT list as it is split or merged",
regionStateNode.getRegionInfo().getEncodedName());
Expand Down Expand Up @@ -153,4 +150,9 @@ public List<RegionStateNode> getRegionsInTransition() {
public void setTableStateManager(TableStateManager tableStateManager) {
this.tableStateManager = tableStateManager;
}

private static boolean isReplica(RegionStateNode regionStateNode) {
return regionStateNode.getRegionInfo().getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,17 @@ private AssignmentTestingUtil() {
}

public static void waitForRegionToBeInTransition(final HBaseTestingUtil util,
final RegionInfo hri) throws Exception {
while (!getMaster(util).getAssignmentManager().isRegionInTransition(hri)) {
final RegionInfo hri) {
while (!isRegionInTransition(hri, getMaster(util).getAssignmentManager())) {
Threads.sleep(10);
Comment thread
apurtell marked this conversation as resolved.
}
}

public static boolean isRegionInTransition(RegionInfo hri, AssignmentManager am) {
return am.getRegionsInTransition().stream()
.anyMatch(rsn -> rsn.getRegionInfo().getEncodedName().equals(hri.getEncodedName()));
}

public static void waitForRsToBeDead(final HBaseTestingUtil util, final ServerName serverName)
throws Exception {
util.waitFor(60000, new ExplainingPredicate<Exception>() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import static org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil.insertData;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

Expand Down Expand Up @@ -117,8 +118,8 @@ public void testSplitTableRegion() throws Exception {
int splitRowNum = startRowNum + rowCount / 2;
byte[] splitKey = Bytes.toBytes("" + splitRowNum);

assertTrue(regions != null, "not able to find a splittable region");
assertTrue(regions.length == 1, "not able to find a splittable region");
assertNotNull(regions, "not able to find a splittable region");
assertEquals(1, regions.length, "not able to find a splittable region");

// Split region of the table
long procId = procExec.submitProcedure(
Expand All @@ -127,7 +128,7 @@ public void testSplitTableRegion() throws Exception {
ProcedureTestingUtility.waitProcedure(procExec, procId);
ProcedureTestingUtility.assertProcNotFailed(procExec, procId);

assertTrue(UTIL.getHBaseCluster().getRegions(tableName).size() == 2, "not able to split table");
assertEquals(2, UTIL.getHBaseCluster().getRegions(tableName).size(), "not able to split table");

// disable table
UTIL.getAdmin().disableTable(tableName);
Expand Down Expand Up @@ -155,6 +156,57 @@ public void testSplitTableRegion() throws Exception {
regionInfoMap.get(tableRegions.get(1).getRegionInfo()));
}

@Test
public void testRITWithSplitTableRegion() throws Exception {
final TableName tableName = TableName.valueOf(testMethodName);
final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();

RegionInfo[] regions =
MasterProcedureTestingUtility.createTable(procExec, tableName, null, columnFamilyName);
insertData(UTIL, tableName, rowCount, startRowNum, columnFamilyName);
int splitRowNum = startRowNum + rowCount / 2;
byte[] splitKey = Bytes.toBytes("" + splitRowNum);

assertNotNull(regions, "not able to find a splittable region");
assertEquals(1, regions.length, "not able to find a splittable region");
assertFalse(AssignmentTestingUtil.isRegionInTransition(regions[0],
UTIL.getHBaseCluster().getMaster().getAssignmentManager()));

ServerName targetRS = UTIL.getHBaseCluster().getMaster().getAssignmentManager()
.getRegionStates().getRegionServerOfRegion(regions[0]);
// Split region of the table
long procId = procExec.submitProcedure(
new SplitTableRegionProcedure(procExec.getEnvironment(), regions[0], splitKey));
// Wait the completion
ProcedureTestingUtility.waitProcedure(procExec, procId);
ProcedureTestingUtility.assertProcNotFailed(procExec, procId);

assertEquals(2, UTIL.getHBaseCluster().getRegions(tableName).size(), "not able to split table");
assertFalse(AssignmentTestingUtil.isRegionInTransition(regions[0],
UTIL.getHBaseCluster().getMaster().getAssignmentManager()));
// As there are only 3 RS, start one more RS before expiring one
UTIL.getHBaseCluster().startRegionServer();

// We don't want SCP to complete so kill PR it after store update
ProcedureTestingUtility
.toggleKillAfterStoreUpdate(UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor());
// stop RS holding split parent to create SCP and add RS into deadServerList
UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(targetRS);

// stop master
UTIL.getHBaseCluster().stopMaster(0);
UTIL.getHBaseCluster().waitOnMaster(0);

// restart master
UTIL.getHBaseCluster().startMaster();
assertTrue(UTIL.getHBaseCluster().waitForActiveAndReadyMaster(30000),
"Master failed to initialize in in 30 seconds");
UTIL.invalidateConnection();

assertFalse(AssignmentTestingUtil.isRegionInTransition(regions[0],
UTIL.getHBaseCluster().getMaster().getAssignmentManager()));
}

@Test
public void testSplitStoreFiles() throws Exception {
final TableName tableName = TableName.valueOf(testMethodName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,8 @@ public void testSplitRegionWithNoStoreFiles() throws Exception {
} catch (DoNotRetryIOException e) {
// Expected
}
assertFalse(am.isRegionInTransition(hri), "Split region can't be assigned");
assertFalse(AssignmentTestingUtil.isRegionInTransition(hri, am),
"Split region can't be assigned");
assertTrue(regionStates.isRegionInState(hri, State.SPLIT));

// We should not be able to unassign it either
Expand All @@ -917,7 +918,8 @@ public void testSplitRegionWithNoStoreFiles() throws Exception {
} catch (DoNotRetryIOException e) {
// Expected
}
assertFalse(am.isRegionInTransition(hri), "Split region can't be unassigned");
assertFalse(AssignmentTestingUtil.isRegionInTransition(hri, am),
"Split region can't be unassigned");
assertTrue(regionStates.isRegionInState(hri, State.SPLIT));
} finally {
admin.balancerSwitch(true, false);
Expand Down
Loading