View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import static org.junit.Assert.assertEquals;
22  import static org.junit.Assert.assertTrue;
23  
24  import java.io.IOException;
25  import java.util.List;
26  import java.util.Map;
27  import java.util.concurrent.atomic.AtomicBoolean;
28  import java.util.concurrent.atomic.AtomicReference;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  import org.apache.hadoop.conf.Configuration;
33  import org.apache.hadoop.hbase.CategoryBasedTimeout;
34  import org.apache.hadoop.hbase.CoordinatedStateManager;
35  import org.apache.hadoop.hbase.HBaseConfiguration;
36  import org.apache.hadoop.hbase.HBaseTestingUtility;
37  import org.apache.hadoop.hbase.HRegionInfo;
38  import org.apache.hadoop.hbase.LocalHBaseCluster;
39  import org.apache.hadoop.hbase.MiniHBaseCluster;
40  import org.apache.hadoop.hbase.ServerName;
41  import org.apache.hadoop.hbase.master.HMaster;
42  import org.apache.hadoop.hbase.master.ServerListener;
43  import org.apache.hadoop.hbase.master.ServerManager;
44  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
45  import org.apache.hadoop.hbase.testclassification.MediumTests;
46  import org.apache.hadoop.hbase.testclassification.RegionServerTests;
47  import org.apache.hadoop.hbase.util.Bytes;
48  import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
49  import org.apache.hadoop.hbase.util.Threads;
50  import org.junit.Ignore;
51  import org.junit.Rule;
52  import org.junit.Test;
53  import org.junit.experimental.categories.Category;
54  import org.junit.rules.TestName;
55  import org.junit.rules.TestRule;
56  
57  /**
58   * Tests that a regionserver that dies after reporting for duty gets removed
59   * from list of online regions. See HBASE-9593.
60   */
61  @Category({RegionServerTests.class, MediumTests.class})
62  @Ignore("Flaky, see HBASE-18346")
63  public class TestRSKilledWhenInitializing {
64    private static final Log LOG = LogFactory.getLog(TestRSKilledWhenInitializing.class);
65    @Rule public TestName testName = new TestName();
66    @Rule public final TestRule timeout = CategoryBasedTimeout.builder().
67      withTimeout(this.getClass()).withLookingForStuckThread(true).build();
68  
69    // This boolean needs to be globally available. It is used below in our
70    // mocked up regionserver so it knows when to die.
71    private static AtomicBoolean masterActive = new AtomicBoolean(false);
72    // Ditto for this variable. It also is used in the mocked regionserver class.
73    private static final AtomicReference<ServerName> killedRS = new AtomicReference<ServerName>();
74  
75    private static final int NUM_MASTERS = 1;
76    private static final int NUM_RS = 2;
77  
78    /**
79     * Test verifies whether a region server is removing from online servers list in master if it went
80     * down after registering with master. Test will TIMEOUT if an error!!!!
81     * @throws Exception
82     */
83    @Test
84    public void testRSTerminationAfterRegisteringToMasterBeforeCreatingEphemeralNode()
85    throws Exception {
86      // Create config to use for this cluster
87      Configuration conf = HBaseConfiguration.create();
88      conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
89      // Start the cluster
90      final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
91      TEST_UTIL.startMiniDFSCluster(3);
92      TEST_UTIL.startMiniZKCluster();
93      TEST_UTIL.createRootDir();
94      final LocalHBaseCluster cluster =
95          new LocalHBaseCluster(conf, NUM_MASTERS, NUM_RS, HMaster.class,
96              RegisterAndDieRegionServer.class);
97      final MasterThread master = startMaster(cluster.getMasters().get(0));
98      try {
99        // Master is up waiting on RegionServers to check in. Now start RegionServers.
100       for (int i = 0; i < NUM_RS; i++) {
101         cluster.getRegionServers().get(i).start();
102       }
103       // Now wait on master to see NUM_RS + 1 servers as being online, thats NUM_RS plus
104       // the Master itself (because Master hosts hbase:meta and checks in as though it a RS).
105       List<ServerName> onlineServersList = null;
106       do {
107         onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
108       } while (onlineServersList.size() < NUM_RS);
109       // Wait until killedRS is set. Means RegionServer is starting to go down.
110       while (killedRS.get() == null) {
111         Threads.sleep(1);
112       }
113       // Wait on the RegionServer to fully die.
114       while (cluster.getLiveRegionServers().size() >= NUM_RS) {
115         Threads.sleep(1);
116       }
117       // Make sure Master is fully up before progressing. Could take a while if regions
118       // being reassigned.
119       while (!master.getMaster().isInitialized()) {
120         Threads.sleep(1);
121       }
122 
123       // Now in steady state. Make sure the killed RS is no longer registered.
124       // branch-1 works differently to master branch.
125       assertTrue(!master.getMaster().getServerManager().isServerOnline(killedRS.get()));
126     } finally {
127       cluster.shutdown();
128       cluster.join();
129       TEST_UTIL.shutdownMiniDFSCluster();
130       TEST_UTIL.shutdownMiniZKCluster();
131       TEST_UTIL.cleanupTestDir();
132     }
133   }
134 
135   /**
136    * Start Master. Get as far as the state where Master is waiting on
137    * RegionServers to check in, then return.
138    */
139   private MasterThread startMaster(MasterThread master) {
140     master.start();
141     // It takes a while until ServerManager creation to happen inside Master startup.
142     while (master.getMaster().getServerManager() == null) {
143       continue;
144     }
145     // Set a listener for the waiting-on-RegionServers state. We want to wait
146     // until this condition before we leave this method and start regionservers.
147     final AtomicBoolean waiting = new AtomicBoolean(false);
148     if (master.getMaster().getServerManager() == null) throw new NullPointerException("SM");
149     master.getMaster().getServerManager().registerListener(new ServerListener() {
150       @Override
151       public void waiting() {
152         waiting.set(true);
153       }
154 
155       @Override
156       public void serverAdded(ServerName serverName) {
157         // TODO Auto-generated method stub
158       }
159 
160       @Override
161       public void serverRemoved(ServerName serverName) {
162         // TODO Auto-generated method stub
163       }
164     });
165     // Wait until the Master gets to place where it is waiting on RegionServers to check in.
166     while (!waiting.get()) {
167       continue;
168     }
169     // Set the global master-is-active; gets picked up by regionservers later.
170     masterActive.set(true);
171     return master;
172   }
173 
174   /**
175    * A RegionServer that reports for duty and then immediately dies if it is the first to receive
176    * the response to a reportForDuty. When it dies, it clears its ephemeral znode which the master
177    * notices and so removes the region from its set of online regionservers.
178    */
179   static class RegisterAndDieRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer {
180     public RegisterAndDieRegionServer(Configuration conf, CoordinatedStateManager cp)
181     throws IOException, InterruptedException {
182       super(conf, cp);
183     }
184 
185     @Override
186     protected void handleReportForDutyResponse(RegionServerStartupResponse c)
187     throws IOException {
188       if (killedRS.compareAndSet(null, getServerName())) {
189         // Make sure Master is up so it will see the removal of the ephemeral znode for this RS.
190         while (!masterActive.get()) {
191           Threads.sleep(100);
192         }
193         super.kill();
194       } else {
195         super.handleReportForDutyResponse(c);
196       }
197     }
198   }
199 }