1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Comparator;
23 import java.util.List;
24 import java.util.Set;
25 import java.util.TreeSet;
26
27 import org.apache.hadoop.conf.Configuration;
28 import org.apache.hadoop.hbase.ClusterManager.ServiceType;
29 import org.apache.hadoop.hbase.classification.InterfaceAudience;
30 import org.apache.hadoop.hbase.client.Admin;
31 import org.apache.hadoop.hbase.client.ClusterConnection;
32 import org.apache.hadoop.hbase.client.Connection;
33 import org.apache.hadoop.hbase.client.ConnectionFactory;
34 import org.apache.hadoop.hbase.client.RegionLocator;
35 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
37 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
38 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
39 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
40 import org.apache.hadoop.hbase.util.Bytes;
41 import org.apache.hadoop.hbase.util.Threads;
42
43
44
45
46
47 @InterfaceAudience.Private
48 public class DistributedHBaseCluster extends HBaseCluster {
49 private Admin admin;
50 private final Connection connection;
51
52 private ClusterManager clusterManager;
53
54 public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
55 throws IOException {
56 super(conf);
57 this.clusterManager = clusterManager;
58 this.connection = ConnectionFactory.createConnection(conf);
59 this.admin = this.connection.getAdmin();
60 this.initialClusterStatus = getClusterStatus();
61 }
62
63 public void setClusterManager(ClusterManager clusterManager) {
64 this.clusterManager = clusterManager;
65 }
66
67 public ClusterManager getClusterManager() {
68 return clusterManager;
69 }
70
71
72
73
74
75 @Override
76 public ClusterStatus getClusterStatus() throws IOException {
77 return admin.getClusterStatus();
78 }
79
80 @Override
81 public ClusterStatus getInitialClusterStatus() throws IOException {
82 return initialClusterStatus;
83 }
84
85 @Override
86 public void close() throws IOException {
87 if (this.admin != null) {
88 admin.close();
89 }
90 if (this.connection != null && !this.connection.isClosed()) {
91 this.connection.close();
92 }
93 }
94
95 @Override
96 public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
97 throws IOException {
98 return ((ClusterConnection)this.connection).getAdmin(serverName);
99 }
100
101 @Override
102 public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
103 throws IOException {
104 return ((ClusterConnection)this.connection).getClient(serverName);
105 }
106
107 @Override
108 public void startRegionServer(String hostname, int port) throws IOException {
109 LOG.info("Starting RS on: " + hostname);
110 clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
111 }
112
113 @Override
114 public void killRegionServer(ServerName serverName) throws IOException {
115 LOG.info("Aborting RS: " + serverName.getServerName());
116 clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
117 serverName.getHostname(), serverName.getPort());
118 }
119
120 @Override
121 public void stopRegionServer(ServerName serverName) throws IOException {
122 LOG.info("Stopping RS: " + serverName.getServerName());
123 clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
124 serverName.getHostname(), serverName.getPort());
125 }
126
127 @Override
128 public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
129 waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
130 }
131
132 @Override
133 public void suspendRegionServer(ServerName serverName) throws IOException {
134 LOG.info("Suspend RS: " + serverName.getServerName());
135 clusterManager.suspend(ServiceType.HBASE_REGIONSERVER,
136 serverName.getHostname(), serverName.getPort());
137 }
138
139 @Override
140 public void resumeRegionServer(ServerName serverName) throws IOException {
141 LOG.info("Resume RS: " + serverName.getServerName());
142 clusterManager.resume(ServiceType.HBASE_REGIONSERVER,
143 serverName.getHostname(), serverName.getPort());
144 }
145
146 @Override
147 public void startZkNode(String hostname, int port) throws IOException {
148 LOG.info("Starting Zookeeper node on: " + hostname);
149 clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
150 }
151
152 @Override
153 public void killZkNode(ServerName serverName) throws IOException {
154 LOG.info("Aborting Zookeeper node on: " + serverName.getServerName());
155 clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
156 serverName.getHostname(), serverName.getPort());
157 }
158
159 @Override
160 public void stopZkNode(ServerName serverName) throws IOException {
161 LOG.info("Stopping Zookeeper node: " + serverName.getServerName());
162 clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
163 serverName.getHostname(), serverName.getPort());
164 }
165
166 @Override
167 public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException {
168 waitForServiceToStart(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
169 }
170
171 @Override
172 public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException {
173 waitForServiceToStop(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
174 }
175
176 @Override
177 public void startDataNode(ServerName serverName) throws IOException {
178 LOG.info("Starting data node on: " + serverName.getServerName());
179 clusterManager.start(ServiceType.HADOOP_DATANODE,
180 serverName.getHostname(), serverName.getPort());
181 }
182
183 @Override
184 public void killDataNode(ServerName serverName) throws IOException {
185 LOG.info("Aborting data node on: " + serverName.getServerName());
186 clusterManager.kill(ServiceType.HADOOP_DATANODE,
187 serverName.getHostname(), serverName.getPort());
188 }
189
190 @Override
191 public void stopDataNode(ServerName serverName) throws IOException {
192 LOG.info("Stopping data node on: " + serverName.getServerName());
193 clusterManager.stop(ServiceType.HADOOP_DATANODE,
194 serverName.getHostname(), serverName.getPort());
195 }
196
197 @Override
198 public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException {
199 waitForServiceToStart(ServiceType.HADOOP_DATANODE, serverName, timeout);
200 }
201
202 @Override
203 public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException {
204 waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
205 }
206
207 @Override
208 public void startNameNode(ServerName serverName) throws IOException {
209 LOG.info("Starting name node on: " + serverName.getServerName());
210 clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
211 serverName.getPort());
212 }
213
214 @Override
215 public void killNameNode(ServerName serverName) throws IOException {
216 LOG.info("Aborting name node on: " + serverName.getServerName());
217 clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
218 serverName.getPort());
219 }
220
221 @Override
222 public void stopNameNode(ServerName serverName) throws IOException {
223 LOG.info(String.format("Stopping name node on: %s", serverName.getServerName()));
224 clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
225 serverName.getPort());
226 }
227
228 @Override
229 public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
230 waitForServiceToStart(ServiceType.HADOOP_NAMENODE, serverName, timeout);
231 }
232
233 @Override
234 public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
235 waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout);
236 }
237
238 private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
239 throws IOException {
240 LOG.info(
241 String.format("Waiting for service: %s to stop: %s", service, serverName.getServerName()));
242 long start = System.currentTimeMillis();
243
244 while ((System.currentTimeMillis() - start) < timeout) {
245 if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
246 return;
247 }
248 Threads.sleep(100);
249 }
250 throw new IOException("did timeout waiting for service to stop:" + serverName);
251 }
252
253 private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
254 throws IOException {
255 LOG.info(String.format(
256 "Waiting for service: %s to start: ", service, serverName.getServerName()));
257 long start = System.currentTimeMillis();
258
259 while ((System.currentTimeMillis() - start) < timeout) {
260 if (clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
261 return;
262 }
263 Threads.sleep(100);
264 }
265 throw new IOException("did timeout waiting for service to start:" + serverName);
266 }
267
268
269 @Override
270 public MasterService.BlockingInterface getMasterAdminService()
271 throws IOException {
272 return ((ClusterConnection)this.connection).getMaster();
273 }
274
275 @Override
276 public void startMaster(String hostname, int port) throws IOException {
277 LOG.info(String.format("Starting Master on: %s:%s", hostname, port));
278 clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
279 }
280
281 @Override
282 public void killMaster(ServerName serverName) throws IOException {
283 LOG.info("Aborting Master: " + serverName.getServerName());
284 clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
285 }
286
287 @Override
288 public void stopMaster(ServerName serverName) throws IOException {
289 LOG.info("Stopping Master: " + serverName.getServerName());
290 clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
291 }
292
293 @Override
294 public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
295 waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
296 }
297
298 @Override
299 public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
300 long start = System.currentTimeMillis();
301 while (System.currentTimeMillis() - start < timeout) {
302 try {
303 getMasterAdminService();
304 return true;
305 } catch (MasterNotRunningException m) {
306 LOG.warn("Master not started yet " + m);
307 } catch (ZooKeeperConnectionException e) {
308 LOG.warn("Failed to connect to ZK " + e);
309 }
310 Threads.sleep(1000);
311 }
312 return false;
313 }
314
315 @Override
316 public ServerName getServerHoldingRegion(TableName tn, byte[] regionName) throws IOException {
317 HRegionLocation regionLoc = null;
318 try (RegionLocator locator = connection.getRegionLocator(tn)) {
319 regionLoc = locator.getRegionLocation(regionName, true);
320 }
321 if (regionLoc == null) {
322 LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName) +
323 ", start key [" + Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
324 return null;
325 }
326
327 AdminProtos.AdminService.BlockingInterface client =
328 ((ClusterConnection)this.connection).getAdmin(regionLoc.getServerName());
329 ServerInfo info = ProtobufUtil.getServerInfo(null, client);
330 return ProtobufUtil.toServerName(info.getServerName());
331 }
332
333 @Override
334 public void waitUntilShutDown() {
335
336 throw new RuntimeException("Not implemented yet");
337 }
338
339 @Override
340 public void shutdown() throws IOException {
341
342 throw new RuntimeException("Not implemented yet");
343 }
344
345 @Override
346 public boolean isDistributedCluster() {
347 return true;
348 }
349
350 @Override
351 public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
352 ClusterStatus current = getClusterStatus();
353
354 LOG.info("Restoring cluster - started");
355
356
357 boolean success = true;
358 success = restoreMasters(initial, current) & success;
359 success = restoreRegionServers(initial, current) & success;
360 success = restoreAdmin() & success;
361
362 LOG.info("Restoring cluster - done");
363 return success;
364 }
365
366 protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
367 List<IOException> deferred = new ArrayList<IOException>();
368
369 final ServerName initMaster = initial.getMaster();
370 if (!ServerName.isSameHostnameAndPort(initMaster, current.getMaster())) {
371 LOG.info("Restoring cluster - Initial active master : "
372 + initMaster.getHostAndPort()
373 + " has changed to : "
374 + current.getMaster().getHostAndPort());
375
376
377 try {
378 if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
379 initMaster.getHostname(), initMaster.getPort())) {
380 LOG.info("Restoring cluster - starting initial active master at:"
381 + initMaster.getHostAndPort());
382 startMaster(initMaster.getHostname(), initMaster.getPort());
383 }
384
385
386
387
388
389 for (ServerName currentBackup : current.getBackupMasters()) {
390 if (!ServerName.isSameHostnameAndPort(currentBackup, initMaster)) {
391 LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
392 stopMaster(currentBackup);
393 }
394 }
395 LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
396 stopMaster(current.getMaster());
397 waitForActiveAndReadyMaster();
398 } catch (IOException ex) {
399
400
401 deferred.add(ex);
402 }
403
404
405 for (ServerName backup : initial.getBackupMasters()) {
406 try {
407
408 if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
409 backup.getHostname(),
410 backup.getPort())) {
411 LOG.info("Restoring cluster - starting initial backup master: "
412 + backup.getHostAndPort());
413 startMaster(backup.getHostname(), backup.getPort());
414 }
415 } catch (IOException ex) {
416 deferred.add(ex);
417 }
418 }
419 } else {
420
421 Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
422 Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
423 toStart.addAll(initial.getBackupMasters());
424 toKill.addAll(current.getBackupMasters());
425
426 for (ServerName server : current.getBackupMasters()) {
427 toStart.remove(server);
428 }
429 for (ServerName server: initial.getBackupMasters()) {
430 toKill.remove(server);
431 }
432
433 for (ServerName sn:toStart) {
434 try {
435 if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
436 LOG.info("Restoring cluster - starting initial backup master: " + sn.getHostAndPort());
437 startMaster(sn.getHostname(), sn.getPort());
438 }
439 } catch (IOException ex) {
440 deferred.add(ex);
441 }
442 }
443
444 for (ServerName sn:toKill) {
445 try {
446 if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
447 LOG.info("Restoring cluster - stopping backup master: " + sn.getHostAndPort());
448 stopMaster(sn);
449 }
450 } catch (IOException ex) {
451 deferred.add(ex);
452 }
453 }
454 }
455 if (!deferred.isEmpty()) {
456 LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:",
457 deferred.size()));
458 for (int i=0; i<deferred.size() && i < 3; i++) {
459 LOG.warn(deferred.get(i));
460 }
461 }
462
463 return deferred.isEmpty();
464 }
465
466
467 private static class ServerNameIgnoreStartCodeComparator implements Comparator<ServerName> {
468 @Override
469 public int compare(ServerName o1, ServerName o2) {
470 int compare = o1.getHostname().compareToIgnoreCase(o2.getHostname());
471 if (compare != 0) return compare;
472 compare = o1.getPort() - o2.getPort();
473 if (compare != 0) return compare;
474 return 0;
475 }
476 }
477
478 protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
479 Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
480 Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
481 toStart.addAll(initial.getServers());
482 toKill.addAll(current.getServers());
483
484 for (ServerName server : current.getServers()) {
485 toStart.remove(server);
486 }
487 for (ServerName server: initial.getServers()) {
488 toKill.remove(server);
489 }
490
491 List<IOException> deferred = new ArrayList<IOException>();
492
493 for(ServerName sn:toStart) {
494 try {
495 if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
496 sn.getHostname(),
497 sn.getPort())) {
498 LOG.info("Restoring cluster - starting initial region server: " + sn.getHostAndPort());
499 startRegionServer(sn.getHostname(), sn.getPort());
500 }
501 } catch (IOException ex) {
502 deferred.add(ex);
503 }
504 }
505
506 for(ServerName sn:toKill) {
507 try {
508 if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
509 sn.getHostname(),
510 sn.getPort())) {
511 LOG.info("Restoring cluster - stopping initial region server: " + sn.getHostAndPort());
512 stopRegionServer(sn);
513 }
514 } catch (IOException ex) {
515 deferred.add(ex);
516 }
517 }
518 if (!deferred.isEmpty()) {
519 LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:",
520 deferred.size()));
521 for (int i=0; i<deferred.size() && i < 3; i++) {
522 LOG.warn(deferred.get(i));
523 }
524 }
525
526 return deferred.isEmpty();
527 }
528
529 protected boolean restoreAdmin() throws IOException {
530
531
532
533
534 try {
535 admin.close();
536 } catch (IOException ioe) {
537 LOG.warn("While closing the old connection", ioe);
538 }
539 this.admin = this.connection.getAdmin();
540 LOG.info("Added new HBaseAdmin");
541 return true;
542 }
543 }