View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Comparator;
23  import java.util.List;
24  import java.util.Set;
25  import java.util.TreeSet;
26  
27  import org.apache.hadoop.conf.Configuration;
28  import org.apache.hadoop.hbase.ClusterManager.ServiceType;
29  import org.apache.hadoop.hbase.classification.InterfaceAudience;
30  import org.apache.hadoop.hbase.client.Admin;
31  import org.apache.hadoop.hbase.client.ClusterConnection;
32  import org.apache.hadoop.hbase.client.Connection;
33  import org.apache.hadoop.hbase.client.ConnectionFactory;
34  import org.apache.hadoop.hbase.client.RegionLocator;
35  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
37  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
38  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
39  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
40  import org.apache.hadoop.hbase.util.Bytes;
41  import org.apache.hadoop.hbase.util.Threads;
42  
43  /**
44   * Manages the interactions with an already deployed distributed cluster (as opposed to
45   * a pseudo-distributed, or mini/local cluster). This is used by integration and system tests.
46   */
47  @InterfaceAudience.Private
48  public class DistributedHBaseCluster extends HBaseCluster {
49    private Admin admin;
50    private final Connection connection;
51  
52    private ClusterManager clusterManager;
53  
54    public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
55        throws IOException {
56      super(conf);
57      this.clusterManager = clusterManager;
58      this.connection = ConnectionFactory.createConnection(conf);
59      this.admin = this.connection.getAdmin();
60      this.initialClusterStatus = getClusterStatus();
61    }
62  
63    public void setClusterManager(ClusterManager clusterManager) {
64      this.clusterManager = clusterManager;
65    }
66  
67    public ClusterManager getClusterManager() {
68      return clusterManager;
69    }
70  
71    /**
72     * Returns a ClusterStatus for this HBase cluster
73     * @throws IOException
74     */
75    @Override
76    public ClusterStatus getClusterStatus() throws IOException {
77      return admin.getClusterStatus();
78    }
79  
80    @Override
81    public ClusterStatus getInitialClusterStatus() throws IOException {
82      return initialClusterStatus;
83    }
84  
85    @Override
86    public void close() throws IOException {
87      if (this.admin != null) {
88        admin.close();
89      }
90      if (this.connection != null && !this.connection.isClosed()) {
91        this.connection.close();
92      }
93    }
94  
95    @Override
96    public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
97    throws IOException {
98      return ((ClusterConnection)this.connection).getAdmin(serverName);
99    }
100 
101   @Override
102   public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
103   throws IOException {
104     return ((ClusterConnection)this.connection).getClient(serverName);
105   }
106 
107   @Override
108   public void startRegionServer(String hostname, int port) throws IOException {
109     LOG.info("Starting RS on: " + hostname);
110     clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
111   }
112 
113   @Override
114   public void killRegionServer(ServerName serverName) throws IOException {
115     LOG.info("Aborting RS: " + serverName.getServerName());
116     clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
117       serverName.getHostname(), serverName.getPort());
118   }
119 
120   @Override
121   public void stopRegionServer(ServerName serverName) throws IOException {
122     LOG.info("Stopping RS: " + serverName.getServerName());
123     clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
124       serverName.getHostname(), serverName.getPort());
125   }
126 
127   @Override
128   public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
129     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
130   }
131 
132   @Override
133   public void suspendRegionServer(ServerName serverName) throws IOException {
134     LOG.info("Suspend RS: " + serverName.getServerName());
135     clusterManager.suspend(ServiceType.HBASE_REGIONSERVER,
136         serverName.getHostname(), serverName.getPort());
137   }
138 
139   @Override
140   public void resumeRegionServer(ServerName serverName) throws IOException {
141     LOG.info("Resume RS: " + serverName.getServerName());
142     clusterManager.resume(ServiceType.HBASE_REGIONSERVER,
143         serverName.getHostname(), serverName.getPort());
144   }
145 
146   @Override
147   public void startZkNode(String hostname, int port) throws IOException {
148     LOG.info("Starting Zookeeper node on: " + hostname);
149     clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
150   }
151 
152   @Override
153   public void killZkNode(ServerName serverName) throws IOException {
154     LOG.info("Aborting Zookeeper node on: " + serverName.getServerName());
155     clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
156       serverName.getHostname(), serverName.getPort());
157   }
158 
159   @Override
160   public void stopZkNode(ServerName serverName) throws IOException {
161     LOG.info("Stopping Zookeeper node: " + serverName.getServerName());
162     clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
163       serverName.getHostname(), serverName.getPort());
164   }
165 
166   @Override
167   public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException {
168     waitForServiceToStart(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
169   }
170 
171   @Override
172   public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException {
173     waitForServiceToStop(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
174   }
175 
176   @Override
177   public void startDataNode(ServerName serverName) throws IOException {
178     LOG.info("Starting data node on: " + serverName.getServerName());
179     clusterManager.start(ServiceType.HADOOP_DATANODE,
180       serverName.getHostname(), serverName.getPort());
181   }
182 
183   @Override
184   public void killDataNode(ServerName serverName) throws IOException {
185     LOG.info("Aborting data node on: " + serverName.getServerName());
186     clusterManager.kill(ServiceType.HADOOP_DATANODE,
187       serverName.getHostname(), serverName.getPort());
188   }
189 
190   @Override
191   public void stopDataNode(ServerName serverName) throws IOException {
192     LOG.info("Stopping data node on: " + serverName.getServerName());
193     clusterManager.stop(ServiceType.HADOOP_DATANODE,
194       serverName.getHostname(), serverName.getPort());
195   }
196 
197   @Override
198   public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException {
199     waitForServiceToStart(ServiceType.HADOOP_DATANODE, serverName, timeout);
200   }
201 
202   @Override
203   public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException {
204     waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
205   }
206 
207   @Override
208   public void startNameNode(ServerName serverName) throws IOException {
209     LOG.info("Starting name node on: " + serverName.getServerName());
210     clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
211       serverName.getPort());
212   }
213 
214   @Override
215   public void killNameNode(ServerName serverName) throws IOException {
216     LOG.info("Aborting name node on: " + serverName.getServerName());
217     clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
218       serverName.getPort());
219   }
220 
221   @Override
222   public void stopNameNode(ServerName serverName) throws IOException {
223     LOG.info(String.format("Stopping name node on: %s", serverName.getServerName()));
224     clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
225       serverName.getPort());
226   }
227 
228   @Override
229   public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
230     waitForServiceToStart(ServiceType.HADOOP_NAMENODE, serverName, timeout);
231   }
232 
233   @Override
234   public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
235     waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout);
236   }
237 
238   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
239     throws IOException {
240     LOG.info(
241         String.format("Waiting for service: %s to stop: %s", service, serverName.getServerName()));
242     long start = System.currentTimeMillis();
243 
244     while ((System.currentTimeMillis() - start) < timeout) {
245       if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
246         return;
247       }
248       Threads.sleep(100);
249     }
250     throw new IOException("did timeout waiting for service to stop:" + serverName);
251   }
252 
253   private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
254     throws IOException {
255     LOG.info(String.format(
256         "Waiting for service: %s to start: ", service, serverName.getServerName()));
257     long start = System.currentTimeMillis();
258 
259     while ((System.currentTimeMillis() - start) < timeout) {
260       if (clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
261         return;
262       }
263       Threads.sleep(100);
264     }
265     throw new IOException("did timeout waiting for service to start:" + serverName);
266   }
267 
268 
269   @Override
270   public MasterService.BlockingInterface getMasterAdminService()
271   throws IOException {
272     return ((ClusterConnection)this.connection).getMaster();
273   }
274 
275   @Override
276   public void startMaster(String hostname, int port) throws IOException {
277     LOG.info(String.format("Starting Master on: %s:%s", hostname, port));
278     clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
279   }
280 
281   @Override
282   public void killMaster(ServerName serverName) throws IOException {
283     LOG.info("Aborting Master: " + serverName.getServerName());
284     clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
285   }
286 
287   @Override
288   public void stopMaster(ServerName serverName) throws IOException {
289     LOG.info("Stopping Master: " + serverName.getServerName());
290     clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
291   }
292 
293   @Override
294   public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
295     waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
296   }
297 
298   @Override
299   public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
300     long start = System.currentTimeMillis();
301     while (System.currentTimeMillis() - start < timeout) {
302       try {
303         getMasterAdminService();
304         return true;
305       } catch (MasterNotRunningException m) {
306         LOG.warn("Master not started yet " + m);
307       } catch (ZooKeeperConnectionException e) {
308         LOG.warn("Failed to connect to ZK " + e);
309       }
310       Threads.sleep(1000);
311     }
312     return false;
313   }
314 
315   @Override
316   public ServerName getServerHoldingRegion(TableName tn, byte[] regionName) throws IOException {
317     HRegionLocation regionLoc = null;
318     try (RegionLocator locator = connection.getRegionLocator(tn)) {
319       regionLoc = locator.getRegionLocation(regionName, true);
320     }
321     if (regionLoc == null) {
322       LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName) +
323         ", start key [" + Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
324       return null;
325     }
326 
327     AdminProtos.AdminService.BlockingInterface client =
328         ((ClusterConnection)this.connection).getAdmin(regionLoc.getServerName());
329     ServerInfo info = ProtobufUtil.getServerInfo(null, client);
330     return ProtobufUtil.toServerName(info.getServerName());
331   }
332 
333   @Override
334   public void waitUntilShutDown() {
335     // Simply wait for a few seconds for now (after issuing serverManager.kill
336     throw new RuntimeException("Not implemented yet");
337   }
338 
339   @Override
340   public void shutdown() throws IOException {
341     // not sure we want this
342     throw new RuntimeException("Not implemented yet");
343   }
344 
345   @Override
346   public boolean isDistributedCluster() {
347     return true;
348   }
349 
350   @Override
351   public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
352     ClusterStatus current = getClusterStatus();
353 
354     LOG.info("Restoring cluster - started");
355 
356     // do a best effort restore
357     boolean success = true;
358     success = restoreMasters(initial, current) & success;
359     success = restoreRegionServers(initial, current) & success;
360     success = restoreAdmin() & success;
361 
362     LOG.info("Restoring cluster - done");
363     return success;
364   }
365 
366   protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
367     List<IOException> deferred = new ArrayList<IOException>();
368     //check whether current master has changed
369     final ServerName initMaster = initial.getMaster();
370     if (!ServerName.isSameHostnameAndPort(initMaster, current.getMaster())) {
371       LOG.info("Restoring cluster - Initial active master : "
372               + initMaster.getHostAndPort()
373               + " has changed to : "
374               + current.getMaster().getHostAndPort());
375       // If initial master is stopped, start it, before restoring the state.
376       // It will come up as a backup master, if there is already an active master.
377       try {
378         if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
379                 initMaster.getHostname(), initMaster.getPort())) {
380           LOG.info("Restoring cluster - starting initial active master at:"
381                   + initMaster.getHostAndPort());
382           startMaster(initMaster.getHostname(), initMaster.getPort());
383         }
384 
385         // master has changed, we would like to undo this.
386         // 1. Kill the current backups
387         // 2. Stop current master
388         // 3. Start backup masters
389         for (ServerName currentBackup : current.getBackupMasters()) {
390           if (!ServerName.isSameHostnameAndPort(currentBackup, initMaster)) {
391             LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
392             stopMaster(currentBackup);
393           }
394         }
395         LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
396         stopMaster(current.getMaster());
397         waitForActiveAndReadyMaster(); // wait so that active master takes over
398       } catch (IOException ex) {
399         // if we fail to start the initial active master, we do not want to continue stopping
400         // backup masters. Just keep what we have now
401         deferred.add(ex);
402       }
403 
404       //start backup masters
405       for (ServerName backup : initial.getBackupMasters()) {
406         try {
407           //these are not started in backup mode, but we should already have an active master
408           if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
409                   backup.getHostname(),
410                   backup.getPort())) {
411             LOG.info("Restoring cluster - starting initial backup master: "
412                     + backup.getHostAndPort());
413             startMaster(backup.getHostname(), backup.getPort());
414           }
415         } catch (IOException ex) {
416           deferred.add(ex);
417         }
418       }
419     } else {
420       //current master has not changed, match up backup masters
421       Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
422       Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
423       toStart.addAll(initial.getBackupMasters());
424       toKill.addAll(current.getBackupMasters());
425 
426       for (ServerName server : current.getBackupMasters()) {
427         toStart.remove(server);
428       }
429       for (ServerName server: initial.getBackupMasters()) {
430         toKill.remove(server);
431       }
432 
433       for (ServerName sn:toStart) {
434         try {
435           if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
436             LOG.info("Restoring cluster - starting initial backup master: " + sn.getHostAndPort());
437             startMaster(sn.getHostname(), sn.getPort());
438           }
439         } catch (IOException ex) {
440           deferred.add(ex);
441         }
442       }
443 
444       for (ServerName sn:toKill) {
445         try {
446           if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
447             LOG.info("Restoring cluster - stopping backup master: " + sn.getHostAndPort());
448             stopMaster(sn);
449           }
450         } catch (IOException ex) {
451           deferred.add(ex);
452         }
453       }
454     }
455     if (!deferred.isEmpty()) {
456       LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:",
457           deferred.size()));
458       for (int i=0; i<deferred.size() && i < 3; i++) {
459         LOG.warn(deferred.get(i));
460       }
461     }
462 
463     return deferred.isEmpty();
464   }
465 
466 
467   private static class ServerNameIgnoreStartCodeComparator implements Comparator<ServerName> {
468     @Override
469     public int compare(ServerName o1, ServerName o2) {
470       int compare = o1.getHostname().compareToIgnoreCase(o2.getHostname());
471       if (compare != 0) return compare;
472       compare = o1.getPort() - o2.getPort();
473       if (compare != 0) return compare;
474       return 0;
475     }
476   }
477 
478   protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
479     Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
480     Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
481     toStart.addAll(initial.getServers());
482     toKill.addAll(current.getServers());
483 
484     for (ServerName server : current.getServers()) {
485       toStart.remove(server);
486     }
487     for (ServerName server: initial.getServers()) {
488       toKill.remove(server);
489     }
490 
491     List<IOException> deferred = new ArrayList<IOException>();
492 
493     for(ServerName sn:toStart) {
494       try {
495         if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
496                 sn.getHostname(),
497                 sn.getPort())) {
498           LOG.info("Restoring cluster - starting initial region server: " + sn.getHostAndPort());
499           startRegionServer(sn.getHostname(), sn.getPort());
500         }
501       } catch (IOException ex) {
502         deferred.add(ex);
503       }
504     }
505 
506     for(ServerName sn:toKill) {
507       try {
508         if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
509                 sn.getHostname(),
510                 sn.getPort())) {
511           LOG.info("Restoring cluster - stopping initial region server: " + sn.getHostAndPort());
512           stopRegionServer(sn);
513         }
514       } catch (IOException ex) {
515         deferred.add(ex);
516       }
517     }
518     if (!deferred.isEmpty()) {
519       LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:",
520           deferred.size()));
521       for (int i=0; i<deferred.size() && i < 3; i++) {
522         LOG.warn(deferred.get(i));
523       }
524     }
525 
526     return deferred.isEmpty();
527   }
528 
529   protected boolean restoreAdmin() throws IOException {
530     // While restoring above, if the HBase Master which was initially the Active one, was down
531     // and the restore put the cluster back to Initial configuration, HAdmin instance will need
532     // to refresh its connections (otherwise it will return incorrect information) or we can
533     // point it to new instance.
534     try {
535       admin.close();
536     } catch (IOException ioe) {
537       LOG.warn("While closing the old connection", ioe);
538     }
539     this.admin = this.connection.getAdmin();
540     LOG.info("Added new HBaseAdmin");
541     return true;
542   }
543 }