001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import java.io.Closeable;
021import java.io.IOException;
022import org.apache.hadoop.conf.Configurable;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.client.RegionInfoBuilder;
025import org.apache.hadoop.hbase.util.Threads;
026import org.apache.yetus.audience.InterfaceAudience;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
031import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService;
032import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;
033
034/**
035 * This class defines methods that can help with managing HBase clusters from unit tests and system
036 * tests. There are 3 types of cluster deployments:
037 * <ul>
038 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads, used by unit
039 * tests</li>
040 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
041 * interact with the cluster.</li>
042 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate JVMs.
043 * </li>
044 * </ul>
045 * <p>
046 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can be run
047 * against a mini-cluster during unit test execution, or a distributed cluster having tens/hundreds
048 * of nodes during execution of integration tests.
049 * <p>
050 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume
051 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster, and
052 * some tests will still need to mock stuff and introspect internal state. For those use cases from
053 * unit tests, or if more control is needed, you can use the subclasses directly. In that sense,
054 * this class does not abstract away <strong>every</strong> interface that MiniHBaseCluster or
055 * DistributedHBaseCluster provide.
056 */
057@InterfaceAudience.Public
058public abstract class HBaseCluster implements Closeable, Configurable {
059  // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope
060  static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName());
061  protected Configuration conf;
062
063  /** the status of the cluster before we begin */
064  protected ClusterMetrics initialClusterStatus;
065
066  /**
067   * Construct an HBaseCluster
068   * @param conf Configuration to be used for cluster
069   */
070  public HBaseCluster(Configuration conf) {
071    setConf(conf);
072  }
073
074  @Override
075  public void setConf(Configuration conf) {
076    this.conf = conf;
077  }
078
079  @Override
080  public Configuration getConf() {
081    return conf;
082  }
083
084  /**
085   * Returns a ClusterMetrics for this HBase cluster.
086   * @see #getInitialClusterMetrics()
087   */
088  public abstract ClusterMetrics getClusterMetrics() throws IOException;
089
090  /**
091   * Returns a ClusterStatus for this HBase cluster as observed at the starting of the HBaseCluster
092   */
093  public ClusterMetrics getInitialClusterMetrics() throws IOException {
094    return initialClusterStatus;
095  }
096
097  /**
098   * Returns an {@link MasterService.BlockingInterface} to the active master
099   */
100  public abstract MasterService.BlockingInterface getMasterAdminService() throws IOException;
101
102  /**
103   * Returns an AdminProtocol interface to the regionserver
104   */
105  public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
106    throws IOException;
107
108  /**
109   * Returns a ClientProtocol interface to the regionserver
110   */
111  public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName)
112    throws IOException;
113
114  /**
115   * Starts a new region server on the given hostname or if this is a mini/local cluster, starts a
116   * region server locally.
117   * @param hostname the hostname to start the regionserver on
118   * @throws IOException if something goes wrong
119   */
120  public abstract void startRegionServer(String hostname, int port) throws IOException;
121
122  /**
123   * Kills the region server process if this is a distributed cluster, otherwise this causes the
124   * region server to exit doing basic clean up only.
125   * @throws IOException if something goes wrong
126   */
127  public abstract void killRegionServer(ServerName serverName) throws IOException;
128
129  /**
130   * Keeping track of killed servers and being able to check if a particular server was killed makes
131   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
132   * example of such case is - killing servers and waiting for all regions of a particular table to
133   * be assigned. We can check for server column in META table and that its value is not one of the
134   * killed servers.
135   */
136  public abstract boolean isKilledRS(ServerName serverName);
137
138  /**
139   * Stops the given region server, by attempting a gradual stop.
140   * @throws IOException if something goes wrong
141   */
142  public abstract void stopRegionServer(ServerName serverName) throws IOException;
143
144  /**
145   * Wait for the specified region server to join the cluster
146   * @throws IOException if something goes wrong or timeout occurs
147   */
148  public void waitForRegionServerToStart(String hostname, int port, long timeout)
149    throws IOException {
150    long start = System.currentTimeMillis();
151    while ((System.currentTimeMillis() - start) < timeout) {
152      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
153        if (server.getHostname().equals(hostname) && server.getPort() == port) {
154          return;
155        }
156      }
157      Threads.sleep(100);
158    }
159    throw new IOException(
160      "did timeout " + timeout + "ms waiting for region server to start: " + hostname);
161  }
162
163  /**
164   * Wait for the specified region server to stop the thread / process.
165   * @throws IOException if something goes wrong or timeout occurs
166   */
167  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
168    throws IOException;
169
170  /**
171   * Suspend the region server
172   * @param serverName the hostname to suspend the regionserver on
173   * @throws IOException if something goes wrong
174   */
175  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
176
177  /**
178   * Resume the region server
179   * @param serverName the hostname to resume the regionserver on
180   * @throws IOException if something goes wrong
181   */
182  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
183
184  /**
185   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, silently
186   * logs warning message.
187   * @param hostname the hostname to start the regionserver on
188   * @throws IOException if something goes wrong
189   */
190  public abstract void startZkNode(String hostname, int port) throws IOException;
191
192  /**
193   * Kills the zookeeper node process if this is a distributed cluster, otherwise, this causes
194   * master to exit doing basic clean up only.
195   * @throws IOException if something goes wrong
196   */
197  public abstract void killZkNode(ServerName serverName) throws IOException;
198
199  /**
200   * Stops the region zookeeper if this is a distributed cluster, otherwise silently logs warning
201   * message.
202   * @throws IOException if something goes wrong
203   */
204  public abstract void stopZkNode(ServerName serverName) throws IOException;
205
206  /**
207   * Wait for the specified zookeeper node to join the cluster
208   * @throws IOException if something goes wrong or timeout occurs
209   */
210  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException;
211
212  /**
213   * Wait for the specified zookeeper node to stop the thread / process.
214   * @throws IOException if something goes wrong or timeout occurs
215   */
216  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException;
217
218  /**
219   * Starts a new datanode on the given hostname or if this is a mini/local cluster, silently logs
220   * warning message.
221   * @throws IOException if something goes wrong
222   */
223  public abstract void startDataNode(ServerName serverName) throws IOException;
224
225  /**
226   * Kills the datanode process if this is a distributed cluster, otherwise, this causes master to
227   * exit doing basic clean up only.
228   * @throws IOException if something goes wrong
229   */
230  public abstract void killDataNode(ServerName serverName) throws IOException;
231
232  /**
233   * Stops the datanode if this is a distributed cluster, otherwise silently logs warning message.
234   * @throws IOException if something goes wrong
235   */
236  public abstract void stopDataNode(ServerName serverName) throws IOException;
237
238  /**
239   * Wait for the specified datanode to join the cluster
240   * @throws IOException if something goes wrong or timeout occurs
241   */
242  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
243    throws IOException;
244
245  /**
246   * Wait for the specified datanode to stop the thread / process.
247   * @throws IOException if something goes wrong or timeout occurs
248   */
249  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
250    throws IOException;
251
252  /**
253   * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
254   * warning message.
255   * @throws IOException if something goes wrong
256   */
257  public abstract void startNameNode(ServerName serverName) throws IOException;
258
259  /**
260   * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to
261   * exit doing basic clean up only.
262   * @throws IOException if something goes wrong
263   */
264  public abstract void killNameNode(ServerName serverName) throws IOException;
265
266  /**
267   * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
268   * @throws IOException if something goes wrong
269   */
270  public abstract void stopNameNode(ServerName serverName) throws IOException;
271
272  /**
273   * Wait for the specified namenode to join the cluster
274   * @throws IOException if something goes wrong or timeout occurs
275   */
276  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
277    throws IOException;
278
279  /**
280   * Wait for the specified namenode to stop
281   * @throws IOException if something goes wrong or timeout occurs
282   */
283  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
284    throws IOException;
285
286  /**
287   * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
288   * locally.
289   * @param hostname the hostname to start the master on
290   * @throws IOException if something goes wrong
291   */
292  public abstract void startMaster(String hostname, int port) throws IOException;
293
294  /**
295   * Kills the master process if this is a distributed cluster, otherwise, this causes master to
296   * exit doing basic clean up only.
297   * @throws IOException if something goes wrong
298   */
299  public abstract void killMaster(ServerName serverName) throws IOException;
300
301  /**
302   * Stops the given master, by attempting a gradual stop.
303   * @throws IOException if something goes wrong
304   */
305  public abstract void stopMaster(ServerName serverName) throws IOException;
306
307  /**
308   * Wait for the specified master to stop the thread / process.
309   * @throws IOException if something goes wrong or timeout occurs
310   */
311  public abstract void waitForMasterToStop(ServerName serverName, long timeout) throws IOException;
312
313  /**
314   * Blocks until there is an active master and that master has completed initialization.
315   * @return true if an active master becomes available. false if there are no masters left.
316   * @throws IOException if something goes wrong or timeout occurs
317   */
318  public boolean waitForActiveAndReadyMaster() throws IOException {
319    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
320  }
321
322  /**
323   * Blocks until there is an active master and that master has completed initialization.
324   * @param timeout the timeout limit in ms
325   * @return true if an active master becomes available. false if there are no masters left.
326   */
327  public abstract boolean waitForActiveAndReadyMaster(long timeout) throws IOException;
328
329  /**
330   * Wait for HBase Cluster to shut down.
331   */
332  public abstract void waitUntilShutDown() throws IOException;
333
334  /**
335   * Shut down the HBase cluster
336   */
337  public abstract void shutdown() throws IOException;
338
339  /**
340   * Restores the cluster to it's initial state if this is a real cluster, otherwise does nothing.
341   * This is a best effort restore. If the servers are not reachable, or insufficient permissions,
342   * etc. restoration might be partial.
343   * @return whether restoration is complete
344   */
345  public boolean restoreInitialStatus() throws IOException {
346    return restoreClusterMetrics(getInitialClusterMetrics());
347  }
348
349  /**
350   * Restores the cluster to given state if this is a real cluster, otherwise does nothing. This is
351   * a best effort restore. If the servers are not reachable, or insufficient permissions, etc.
352   * restoration might be partial.
353   * @return whether restoration is complete
354   */
355  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
356    return true;
357  }
358
359  /**
360   * Get the ServerName of region server serving the first hbase:meta region
361   */
362  public ServerName getServerHoldingMeta() throws IOException {
363    return getServerHoldingRegion(TableName.META_TABLE_NAME,
364      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
365  }
366
367  /**
368   * Get the ServerName of region server serving the specified region
369   * @param regionName Name of the region in bytes
370   * @param tn         Table name that has the region.
371   * @return ServerName that hosts the region or null
372   */
373  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
374    throws IOException;
375
376  /**
377   * @return whether we are interacting with a distributed cluster as opposed to an in-process
378   *         mini/local cluster.
379   */
380  public boolean isDistributedCluster() {
381    return false;
382  }
383
384  /**
385   * Closes all the resources held open for this cluster. Note that this call does not shutdown the
386   * cluster.
387   * @see #shutdown()
388   */
389  @Override
390  public abstract void close() throws IOException;
391
392  /**
393   * Wait for the namenode.
394   */
395  public void waitForNamenodeAvailable() throws InterruptedException {
396  }
397
398  public void waitForDatanodesRegistered(int nbDN) throws Exception {
399  }
400}