View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.regionserver;
19  
20  import static org.junit.Assert.assertEquals;
21  import static org.junit.Assert.assertTrue;
22  
23  import java.io.IOException;
24  import java.io.StringWriter;
25  import org.apache.commons.lang.StringUtils;
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.hadoop.conf.Configuration;
29  import org.apache.hadoop.hbase.CoordinatedStateManager;
30  import org.apache.hadoop.hbase.HBaseTestingUtility;
31  import org.apache.hadoop.hbase.HConstants;
32  import org.apache.hadoop.hbase.LocalHBaseCluster;
33  import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer;
34  import org.apache.hadoop.hbase.ServerName;
35  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
36  import org.apache.hadoop.hbase.master.HMaster;
37  import org.apache.hadoop.hbase.master.ServerManager;
38  import org.apache.hadoop.hbase.testclassification.MediumTests;
39  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
40  import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
41  import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
42  import org.apache.hadoop.hbase.util.ManualEnvironmentEdge;
43  import org.apache.log4j.Appender;
44  import org.apache.log4j.Layout;
45  import org.apache.log4j.PatternLayout;
46  import org.apache.log4j.WriterAppender;
47  import org.apache.zookeeper.KeeperException;
48  import org.junit.After;
49  import org.junit.Before;
50  import org.junit.Test;
51  import org.junit.experimental.categories.Category;
52  
53  @Category(MediumTests.class)
54  public class TestRegionServerReportForDuty {
55  
56    private static final Log LOG = LogFactory.getLog(TestRegionServerReportForDuty.class);
57  
58    private static final long SLEEP_INTERVAL = 500;
59  
60    private HBaseTestingUtility testUtil;
61    private LocalHBaseCluster cluster;
62    private RegionServerThread rs;
63    private RegionServerThread rs2;
64    private MasterThread master;
65    private MasterThread backupMaster;
66  
67    @Before
68    public void setUp() throws Exception {
69      testUtil = new HBaseTestingUtility();
70      testUtil.startMiniDFSCluster(1);
71      testUtil.startMiniZKCluster(1);
72      testUtil.createRootDir();
73      cluster = new LocalHBaseCluster(testUtil.getConfiguration(), 0, 0);
74    }
75  
76    @After
77    public void tearDown() throws Exception {
78      cluster.shutdown();
79      cluster.join();
80      testUtil.shutdownMiniZKCluster();
81      testUtil.shutdownMiniDFSCluster();
82    }
83  
84    /**
85     * LogCapturer is similar to {@link org.apache.hadoop.test.GenericTestUtils.LogCapturer}
86     * except that this implementation has a default appender to the root logger.
87     * Hadoop 2.8+ supports the default appender in the LogCapture it ships and this can be replaced.
88     * TODO: This class can be removed after we upgrade Hadoop dependency.
89     */
90    static class LogCapturer {
91      private StringWriter sw = new StringWriter();
92      private WriterAppender appender;
93      private org.apache.log4j.Logger logger;
94  
95      LogCapturer(org.apache.log4j.Logger logger) {
96        this.logger = logger;
97        Appender defaultAppender = org.apache.log4j.Logger.getRootLogger().getAppender("stdout");
98        if (defaultAppender == null) {
99          defaultAppender = org.apache.log4j.Logger.getRootLogger().getAppender("console");
100       }
101       final Layout layout = (defaultAppender == null) ? new PatternLayout() :
102           defaultAppender.getLayout();
103       this.appender = new WriterAppender(layout, sw);
104       this.logger.addAppender(this.appender);
105     }
106 
107     String getOutput() {
108       return sw.toString();
109     }
110 
111     public void stopCapturing() {
112       this.logger.removeAppender(this.appender);
113     }
114   }
115 
116   /**
117    * This test HMaster class will always throw ServerNotRunningYetException if checked.
118    */
119   public static class NeverInitializedMaster extends HMaster {
120     public NeverInitializedMaster(Configuration conf, CoordinatedStateManager csm)
121         throws IOException, KeeperException, InterruptedException {
122       super(conf, csm);
123     }
124 
125     @Override
126     protected void checkServiceStarted() throws ServerNotRunningYetException {
127       throw new ServerNotRunningYetException("Server is not running yet");
128     }
129   }
130 
131   /**
132    * Tests region server should backoff to report for duty if master is not ready.
133    */
134   @Test
135   public void testReportForDutyBackoff() throws IOException, InterruptedException {
136     cluster.getConfiguration().set(HConstants.MASTER_IMPL, NeverInitializedMaster.class.getName());
137     master = cluster.addMaster();
138     master.start();
139 
140     LogCapturer capturer = new LogCapturer(org.apache.log4j.Logger.getLogger(HRegionServer.class));
141     // Set sleep interval relatively low so that exponential backoff is more demanding.
142     int msginterval = 100;
143     cluster.getConfiguration().setInt("hbase.regionserver.msginterval", msginterval);
144     rs = cluster.addRegionServer();
145     rs.start();
146 
147     int interval = 10_000;
148     Thread.sleep(interval);
149     capturer.stopCapturing();
150     String output = capturer.getOutput();
151     LOG.info(output);
152     String failMsg = "reportForDuty failed;";
153     int count = StringUtils.countMatches(output, failMsg);
154 
155     // Following asserts the actual retry number is in range (expectedRetry/2, expectedRetry*2).
156     // Ideally we can assert the exact retry count. We relax here to tolerate contention error.
157     int expectedRetry = (int)Math.ceil(Math.log(interval - msginterval));
158     assertTrue(String.format("reportForDuty retries %d times, less than expected min %d",
159         count, expectedRetry / 2), count > expectedRetry / 2);
160     assertTrue(String.format("reportForDuty retries %d times, more than expected max %d",
161         count, expectedRetry * 2), count < expectedRetry * 2);
162   }
163 
164   /**
165    * Tests region sever reportForDuty with backup master becomes primary master after
166    * the first master goes away.
167    */
168   @Test (timeout=180000)
169   public void testReportForDutyWithMasterChange() throws Exception {
170 
171     // Start a master and wait for it to become the active/primary master.
172     // Use a random unique port
173     cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtility.randomFreePort());
174     cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
175     cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 1);
176     master = cluster.addMaster();
177     rs = cluster.addRegionServer();
178     LOG.debug("Starting master: " + master.getMaster().getServerName());
179     master.start();
180     rs.start();
181 
182     waitForClusterOnline(master);
183 
184     // Add a 2nd region server
185     cluster.getConfiguration().set(HConstants.REGION_SERVER_IMPL, MyRegionServer.class.getName());
186     rs2 = cluster.addRegionServer();
187     // Start the region server. This region server will refresh RPC connection
188     // from the current active master to the next active master before completing
189     // reportForDuty
190     LOG.debug("Starting 2nd region server: " + rs2.getRegionServer().getServerName());
191     rs2.start();
192 
193     waitForSecondRsStarted();
194 
195     // Stop the current master.
196     master.getMaster().stop("Stopping master");
197 
198     // Start a new master and use another random unique port
199     // Also let it wait for exactly 2 region severs to report in.
200     cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtility.randomFreePort());
201     cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 2);
202     cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
203     backupMaster = cluster.addMaster();
204     LOG.debug("Starting new master: " + backupMaster.getMaster().getServerName());
205     backupMaster.start();
206 
207     waitForClusterOnline(backupMaster);
208 
209     // Do some checking/asserts here.
210     assertTrue(backupMaster.getMaster().isActiveMaster());
211     assertTrue(backupMaster.getMaster().isInitialized());
212     assertEquals(backupMaster.getMaster().getServerManager().getOnlineServersList().size(), 2);
213 
214   }
215 
216   /**
217    * Tests region sever reportForDuty with manual environment edge
218    */
219   @Test(timeout = 60000)
220   public void testReportForDutyWithEnvironmentEdge() throws Exception {
221     // Start a master and wait for it to become the active/primary master.
222     // Use a random unique port
223     cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtility.randomFreePort());
224     cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
225     cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 1);
226 
227     // Inject manual environment edge for clock skew computation between RS and master
228     ManualEnvironmentEdge edge = new ManualEnvironmentEdge();
229     EnvironmentEdgeManager.injectEdge(edge);
230     master = cluster.addMaster();
231     rs = cluster.addRegionServer();
232     LOG.debug("Starting master: " + master.getMaster().getServerName());
233     master.start();
234     rs.start();
235 
236     waitForClusterOnline(master);
237   }
238 
239   private void waitForClusterOnline(MasterThread master) throws InterruptedException {
240     while (true) {
241       if (master.getMaster().isInitialized()) {
242         break;
243       }
244       Thread.sleep(SLEEP_INTERVAL);
245       LOG.debug("Waiting for master to come online ...");
246     }
247     rs.waitForServerOnline();
248   }
249 
250   private void waitForSecondRsStarted() throws InterruptedException {
251     while (true) {
252       if (((MyRegionServer) rs2.getRegionServer()).getRpcStubCreatedFlag() == true) {
253         break;
254       }
255       Thread.sleep(SLEEP_INTERVAL);
256       LOG.debug("Waiting 2nd RS to be started ...");
257     }
258   }
259 
260   // Create a Region Server that provide a hook so that we can wait for the master switch over
261   // before continuing reportForDuty to the mater.
262   // The idea is that we get a RPC connection to the first active master, then we wait.
263   // The first master goes down, the second master becomes the active master. The region
264   // server continues reportForDuty. It should succeed with the new master.
265   public static class MyRegionServer extends MiniHBaseClusterRegionServer {
266 
267     private ServerName sn;
268     // This flag is to make sure this rs has obtained the rpcStub to the first master.
269     // The first master will go down after this.
270     private boolean rpcStubCreatedFlag = false;
271     private boolean masterChanged = false;
272 
273     public MyRegionServer(Configuration conf, CoordinatedStateManager cp)
274       throws IOException, KeeperException,
275         InterruptedException {
276       super(conf, cp);
277     }
278 
279     @Override
280     @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="SWL_SLEEP_WITH_LOCK_HELD",
281       justification="Intended")
282     protected synchronized ServerName createRegionServerStatusStub(boolean refresh) {
283       sn = super.createRegionServerStatusStub(refresh);
284       rpcStubCreatedFlag = true;
285 
286       // Wait for master switch over. Only do this for the second region server.
287       while (!masterChanged) {
288         ServerName newSn = super.getMasterAddressTracker().getMasterAddress(true);
289         if (newSn != null && !newSn.equals(sn)) {
290           masterChanged = true;
291           break;
292         }
293         try {
294           Thread.sleep(SLEEP_INTERVAL);
295         } catch (InterruptedException e) {
296           return null;
297         }
298         LOG.debug("Waiting for master switch over ... ");
299       }
300       return sn;
301     }
302 
303     public boolean getRpcStubCreatedFlag() {
304       return rpcStubCreatedFlag;
305     }
306   }
307 }