1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.tool;
21
22 import static org.apache.hadoop.hbase.HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT;
23 import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_ZNODE_PARENT;
24
25 import com.google.common.collect.Lists;
26 import java.io.Closeable;
27 import java.io.IOException;
28 import java.net.InetSocketAddress;
29 import java.util.ArrayList;
30 import java.util.Arrays;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.HashSet;
34 import java.util.LinkedList;
35 import java.util.List;
36 import java.util.Map;
37 import java.util.Random;
38 import java.util.Set;
39 import java.util.TreeSet;
40 import java.util.concurrent.Callable;
41 import java.util.concurrent.ConcurrentHashMap;
42 import java.util.concurrent.ExecutionException;
43 import java.util.concurrent.ExecutorService;
44 import java.util.concurrent.Future;
45 import java.util.concurrent.ScheduledThreadPoolExecutor;
46 import java.util.concurrent.atomic.AtomicLong;
47 import java.util.regex.Matcher;
48 import java.util.regex.Pattern;
49 import org.apache.commons.lang.time.StopWatch;
50 import org.apache.hadoop.conf.Configuration;
51 import org.apache.hadoop.hbase.AuthUtil;
52 import org.apache.hadoop.hbase.ChoreService;
53 import org.apache.hadoop.hbase.ClusterStatus;
54 import org.apache.hadoop.hbase.DoNotRetryIOException;
55 import org.apache.hadoop.hbase.HBaseConfiguration;
56 import org.apache.hadoop.hbase.HBaseInterfaceAudience;
57 import org.apache.hadoop.hbase.HColumnDescriptor;
58 import org.apache.hadoop.hbase.HConstants;
59 import org.apache.hadoop.hbase.HRegionInfo;
60 import org.apache.hadoop.hbase.HRegionLocation;
61 import org.apache.hadoop.hbase.HTableDescriptor;
62 import org.apache.hadoop.hbase.NamespaceDescriptor;
63 import org.apache.hadoop.hbase.ScheduledChore;
64 import org.apache.hadoop.hbase.ServerName;
65 import org.apache.hadoop.hbase.TableName;
66 import org.apache.hadoop.hbase.TableNotEnabledException;
67 import org.apache.hadoop.hbase.TableNotFoundException;
68 import org.apache.hadoop.hbase.classification.InterfaceAudience;
69 import org.apache.hadoop.hbase.client.Admin;
70 import org.apache.hadoop.hbase.client.Connection;
71 import org.apache.hadoop.hbase.client.ConnectionFactory;
72 import org.apache.hadoop.hbase.client.Get;
73 import org.apache.hadoop.hbase.client.Put;
74 import org.apache.hadoop.hbase.client.RegionLocator;
75 import org.apache.hadoop.hbase.client.ResultScanner;
76 import org.apache.hadoop.hbase.client.Scan;
77 import org.apache.hadoop.hbase.client.Table;
78 import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
79 import org.apache.hadoop.hbase.tool.CanaryTool.RegionTask.TaskType;
80 import org.apache.hadoop.hbase.util.Bytes;
81 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
82 import org.apache.hadoop.hbase.util.ReflectionUtils;
83 import org.apache.hadoop.hbase.util.RegionSplitter;
84 import org.apache.hadoop.hbase.zookeeper.EmptyWatcher;
85 import org.apache.hadoop.hbase.zookeeper.ZKConfig;
86 import org.apache.hadoop.util.Tool;
87 import org.apache.hadoop.util.ToolRunner;
88 import org.apache.zookeeper.KeeperException;
89 import org.apache.zookeeper.ZooKeeper;
90 import org.apache.zookeeper.client.ConnectStringParser;
91 import org.apache.zookeeper.data.Stat;
92 import org.slf4j.Logger;
93 import org.slf4j.LoggerFactory;
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113 @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
114 public class CanaryTool implements Tool, Canary {
115
116 @Override
117 public int checkRegions(String[] targets) throws Exception {
118 String configuredReadTableTimeoutsStr = conf.get(HBASE_CANARY_REGION_READ_TABLE_TIMEOUT);
119 try {
120 if (configuredReadTableTimeoutsStr != null) {
121 populateReadTableTimeoutsMap(configuredReadTableTimeoutsStr);
122 }
123 } catch (IllegalArgumentException e) {
124 LOG.error("Constructing read table timeouts map failed ", e);
125 return USAGE_EXIT_CODE;
126 }
127 return runMonitor(targets);
128 }
129
  /**
   * Runs the canary in regionserver mode: probes a row from a region hosted on each of the
   * given region servers (all regions per server when the all-regions flag is set).
   *
   * @param targets region server names to probe; may be null
   * @return the monitor's exit code
   */
  @Override
  public int checkRegionServers(String[] targets) throws Exception {
    regionServerMode = true;
    return runMonitor(targets);
  }
135
  /**
   * Runs the canary in zookeeper mode: reads the configured znode on each ensemble member.
   * This mode takes no targets, so {@code null} is passed to the monitor.
   *
   * @return the monitor's exit code
   */
  @Override
  public int checkZooKeeper() throws Exception {
    zookeeperMode = true;
    return runMonitor(null);
  }
141
142
143
144
  /**
   * Sink for canary probe outcomes: read/write success and failure counters plus the set of
   * failed regions, keyed by region name with the hosting server name as the value.
   * Implementations must be safe for use from multiple probe tasks concurrently.
   */
  public interface Sink {
    long getReadFailureCount();
    long incReadFailureCount();
    /** @return region name -> server name for every failed read. */
    Map<String,String> getReadFailures();
    void updateReadFailures(String regionName, String serverName);
    long getWriteFailureCount();
    long incWriteFailureCount();
    /** @return region name -> server name for every failed write. */
    Map<String,String> getWriteFailures();
    void updateWriteFailures(String regionName, String serverName);
    long getReadSuccessCount();
    long incReadSuccessCount();
    long getWriteSuccessCount();
    long incWriteSuccessCount();
  }
159
160
161
162
163 public static class StdOutSink implements Sink {
164 private AtomicLong readFailureCount = new AtomicLong(0),
165 writeFailureCount = new AtomicLong(0),
166 readSuccessCount = new AtomicLong(0),
167 writeSuccessCount = new AtomicLong(0);
168 private Map<String, String> readFailures = new ConcurrentHashMap<>();
169 private Map<String, String> writeFailures = new ConcurrentHashMap<>();
170
171 @Override
172 public long getReadFailureCount() {
173 return readFailureCount.get();
174 }
175
176 @Override
177 public long incReadFailureCount() {
178 return readFailureCount.incrementAndGet();
179 }
180
181 @Override
182 public Map<String, String> getReadFailures() {
183 return readFailures;
184 }
185
186 @Override
187 public void updateReadFailures(String regionName, String serverName) {
188 readFailures.put(regionName, serverName);
189 }
190
191 @Override
192 public long getWriteFailureCount() {
193 return writeFailureCount.get();
194 }
195
196 @Override
197 public long incWriteFailureCount() {
198 return writeFailureCount.incrementAndGet();
199 }
200
201 @Override
202 public Map<String, String> getWriteFailures() {
203 return writeFailures;
204 }
205
206 @Override
207 public void updateWriteFailures(String regionName, String serverName) {
208 writeFailures.put(regionName, serverName);
209 }
210
211 @Override
212 public long getReadSuccessCount() {
213 return readSuccessCount.get();
214 }
215
216 @Override
217 public long incReadSuccessCount() {
218 return readSuccessCount.incrementAndGet();
219 }
220
221 @Override
222 public long getWriteSuccessCount() {
223 return writeSuccessCount.get();
224 }
225
226 @Override
227 public long incWriteSuccessCount() {
228 return writeSuccessCount.incrementAndGet();
229 }
230 }
231
232
233
234
235 public static class RegionServerStdOutSink extends StdOutSink {
236 public void publishReadFailure(String table, String server) {
237 incReadFailureCount();
238 LOG.error("Read from {} on {}", table, server);
239 }
240
241 public void publishReadTiming(String table, String server, long msTime) {
242 LOG.info("Read from {} on {} in {}ms", table, server, msTime);
243 }
244 }
245
246
247
248
249 public static class ZookeeperStdOutSink extends StdOutSink {
250 public void publishReadFailure(String znode, String server) {
251 incReadFailureCount();
252 LOG.error("Read from {} on {}", znode, server);
253 }
254
255 public void publishReadTiming(String znode, String server, long msTime) {
256 LOG.info("Read from {} on {} in {}ms", znode, server, msTime);
257 }
258 }
259
260
261
262
263 public static class RegionStdOutSink extends StdOutSink {
264 private Map<String, AtomicLong> perTableReadLatency = new HashMap<>();
265 private AtomicLong writeLatency = new AtomicLong();
266 private final Map<String, List<RegionTaskResult>> regionMap = new ConcurrentHashMap<>();
267
268 public void publishReadFailure(ServerName serverName, HRegionInfo region, Exception e) {
269 incReadFailureCount();
270 LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
271 }
272
273 public void publishReadFailure(ServerName serverName, HRegionInfo region,
274 HColumnDescriptor column, Exception e) {
275 incReadFailureCount();
276 LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
277 column.getNameAsString(), e);
278 }
279
280 public void publishReadTiming(ServerName serverName, HRegionInfo region,
281 HColumnDescriptor column, long msTime) {
282 RegionTaskResult rtr = new RegionTaskResult(region, region.getTable(), serverName, column);
283 rtr.setReadSuccess();
284 rtr.setReadLatency(msTime);
285 List<RegionTaskResult> rtrs = regionMap.get(region.getRegionNameAsString());
286 rtrs.add(rtr);
287
288 incReadSuccessCount();
289 LOG.info("Read from {} on {} {} in {}ms", region.getRegionNameAsString(), serverName,
290 column.getNameAsString(), msTime);
291 }
292
293 public void publishWriteFailure(ServerName serverName, HRegionInfo region, Exception e) {
294 incWriteFailureCount();
295 LOG.error("Write to {} on {} failed", region.getRegionNameAsString(), serverName, e);
296 }
297
298 public void publishWriteFailure(ServerName serverName, HRegionInfo region,
299 HColumnDescriptor column, Exception e) {
300 incWriteFailureCount();
301 LOG.error("Write to {} on {} {} failed", region.getRegionNameAsString(), serverName,
302 column.getNameAsString(), e);
303 }
304
305 public void publishWriteTiming(ServerName serverName, HRegionInfo region,
306 HColumnDescriptor column, long msTime) {
307 RegionTaskResult rtr = new RegionTaskResult(region, region.getTable(), serverName, column);
308 rtr.setWriteSuccess();
309 rtr.setWriteLatency(msTime);
310 List<RegionTaskResult> rtrs = regionMap.get(region.getRegionNameAsString());
311 rtrs.add(rtr);
312
313 incWriteSuccessCount();
314 LOG.info("Write to {} on {} {} in {}ms",
315 region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime);
316 }
317
318 public Map<String, AtomicLong> getReadLatencyMap() {
319 return this.perTableReadLatency;
320 }
321
322 public AtomicLong initializeAndGetReadLatencyForTable(String tableName) {
323 AtomicLong initLatency = new AtomicLong();
324 this.perTableReadLatency.put(tableName, initLatency);
325 return initLatency;
326 }
327
328 public void initializeWriteLatency() {
329 this.writeLatency.set(0);
330 }
331
332 public AtomicLong getWriteLatency() {
333 return this.writeLatency;
334 }
335
336 public Map<String, List<RegionTaskResult>> getRegionMap() {
337 return this.regionMap;
338 }
339
340 public int getTotalExpectedRegions() {
341 return this.regionMap.size();
342 }
343 }
344
345
346
347
348 static class ZookeeperTask implements Callable<Void> {
349 private final Connection connection;
350 private final String host;
351 private String znode;
352 private final int timeout;
353 private ZookeeperStdOutSink sink;
354
355 public ZookeeperTask(Connection connection, String host, String znode, int timeout,
356 ZookeeperStdOutSink sink) {
357 this.connection = connection;
358 this.host = host;
359 this.znode = znode;
360 this.timeout = timeout;
361 this.sink = sink;
362 }
363
364 @Override public Void call() throws Exception {
365 ZooKeeper zooKeeper = null;
366 try {
367 zooKeeper = new ZooKeeper(host, timeout, EmptyWatcher.instance);
368 Stat exists = zooKeeper.exists(znode, false);
369 StopWatch stopwatch = new StopWatch();
370 stopwatch.start();
371 zooKeeper.getData(znode, false, exists);
372 stopwatch.stop();
373 sink.publishReadTiming(znode, host, stopwatch.getTime());
374 } catch (KeeperException | InterruptedException e) {
375 sink.publishReadFailure(znode, host);
376 } finally {
377 if (zooKeeper != null) {
378 zooKeeper.close();
379 }
380 }
381 return null;
382 }
383 }
384
385
386
387
388
389 static class RegionTask implements Callable<Void> {
390 public enum TaskType{
391 READ, WRITE
392 }
393 private Connection connection;
394 private HRegionInfo region;
395 private RegionStdOutSink sink;
396 private TaskType taskType;
397 private boolean rawScanEnabled;
398 private ServerName serverName;
399 private AtomicLong readWriteLatency;
400
401 RegionTask(Connection connection, HRegionInfo region, ServerName serverName,
402 RegionStdOutSink sink, TaskType taskType, boolean rawScanEnabled, AtomicLong rwLatency) {
403 this.connection = connection;
404 this.region = region;
405 this.serverName = serverName;
406 this.sink = sink;
407 this.taskType = taskType;
408 this.rawScanEnabled = rawScanEnabled;
409 this.readWriteLatency = rwLatency;
410 }
411
412 @Override
413 public Void call() {
414 switch (taskType) {
415 case READ:
416 return read();
417 case WRITE:
418 return write();
419 default:
420 return read();
421 }
422 }
423
424 public Void read() {
425 Table table = null;
426 HTableDescriptor tableDesc = null;
427 try {
428 LOG.debug("Reading table descriptor for table {}", region.getTable());
429 table = connection.getTable(region.getTable());
430 tableDesc = table.getTableDescriptor();
431 } catch (IOException e) {
432 LOG.debug("sniffRegion {} of {} failed", region.getEncodedName(), e);
433 sink.publishReadFailure(serverName, region, e);
434 if (table != null) {
435 try {
436 table.close();
437 } catch (IOException ioe) {
438 LOG.error("Close table failed", e);
439 }
440 }
441 return null;
442 }
443
444 byte[] startKey = null;
445 Get get = null;
446 Scan scan = null;
447 ResultScanner rs = null;
448 StopWatch stopWatch = new StopWatch();
449 for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
450 stopWatch.reset();
451 startKey = region.getStartKey();
452
453 if (startKey.length > 0) {
454 get = new Get(startKey);
455 get.setCacheBlocks(false);
456 get.setFilter(new FirstKeyOnlyFilter());
457 get.addFamily(column.getName());
458 } else {
459 scan = new Scan();
460 LOG.debug("rawScan {} for {}", rawScanEnabled, tableDesc.getTableName());
461 scan.setRaw(rawScanEnabled);
462 scan.setCaching(1);
463 scan.setCacheBlocks(false);
464 scan.setFilter(new FirstKeyOnlyFilter());
465 scan.addFamily(column.getName());
466 scan.setMaxResultSize(1L);
467 scan.setOneRowLimit();
468 }
469 LOG.debug("Reading from {} {} {} {}", tableDesc.getTableName(),
470 region.getRegionNameAsString(), column.getNameAsString(),
471 Bytes.toStringBinary(startKey));
472 try {
473 stopWatch.start();
474 if (startKey.length > 0) {
475 table.get(get);
476 } else {
477 rs = table.getScanner(scan);
478 rs.next();
479 }
480 stopWatch.stop();
481 this.readWriteLatency.addAndGet(stopWatch.getTime());
482 sink.publishReadTiming(serverName, region, column, stopWatch.getTime());
483 } catch (Exception e) {
484 sink.publishReadFailure(serverName, region, column, e);
485 sink.updateReadFailures(region.getRegionNameAsString(), serverName.getHostname());
486 } finally {
487 if (rs != null) {
488 rs.close();
489 }
490 scan = null;
491 get = null;
492 }
493 }
494 try {
495 table.close();
496 } catch (IOException e) {
497 LOG.error("Close table failed", e);
498 }
499 return null;
500 }
501
502
503
504
505 private Void write() {
506 Table table = null;
507 HTableDescriptor tableDesc = null;
508 try {
509 table = connection.getTable(region.getTable());
510 tableDesc = table.getTableDescriptor();
511 byte[] rowToCheck = region.getStartKey();
512 if (rowToCheck.length == 0) {
513 rowToCheck = new byte[]{0x0};
514 }
515 int writeValueSize =
516 connection.getConfiguration().getInt(HConstants.HBASE_CANARY_WRITE_VALUE_SIZE_KEY, 10);
517 for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
518 Put put = new Put(rowToCheck);
519 byte[] value = new byte[writeValueSize];
520 Bytes.random(value);
521 put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
522
523 LOG.debug("Writing to {} {} {} {}",
524 tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
525 Bytes.toStringBinary(rowToCheck));
526 try {
527 long startTime = System.currentTimeMillis();
528 table.put(put);
529 long time = System.currentTimeMillis() - startTime;
530 this.readWriteLatency.addAndGet(time);
531 sink.publishWriteTiming(serverName, region, column, time);
532 } catch (Exception e) {
533 sink.publishWriteFailure(serverName, region, column, e);
534 }
535 }
536 table.close();
537 } catch (IOException e) {
538 sink.publishWriteFailure(serverName, region, e);
539 sink.updateWriteFailures(region.getRegionNameAsString(), serverName.getHostname() );
540 }
541 return null;
542 }
543 }
544
545
546
547
548
  /**
   * Probe one region hosted on a regionserver: read a single row (a Get on the start key, or
   * a one-row scan when the region's start key is empty) and publish the timing or failure to
   * the sink. Failures are reported via the sink and logger, never propagated.
   */
  static class RegionServerTask implements Callable<Void> {
    private Connection connection;
    private String serverName;
    private HRegionInfo region;
    private RegionServerStdOutSink sink;
    // Shared counter, incremented once per successful (or acceptably-skipped) probe.
    private AtomicLong successes;

    RegionServerTask(Connection connection, String serverName, HRegionInfo region,
        RegionServerStdOutSink sink, AtomicLong successes) {
      this.connection = connection;
      this.serverName = serverName;
      this.region = region;
      this.sink = sink;
      this.successes = successes;
    }

    @Override
    public Void call() {
      TableName tableName = null;
      Table table = null;
      Get get = null;
      byte[] startKey = null;
      Scan scan = null;
      StopWatch stopWatch = new StopWatch();

      stopWatch.reset();
      try {
        tableName = region.getTable();
        table = connection.getTable(tableName);
        startKey = region.getStartKey();
        // A Get on an empty row key is invalid, so the first region is probed with a scan.
        LOG.debug("Reading from {} {} {} {}",
            serverName, region.getTable(), region.getRegionNameAsString(),
            Bytes.toStringBinary(startKey));
        if (startKey.length > 0) {
          get = new Get(startKey);
          get.setCacheBlocks(false);
          get.setFilter(new FirstKeyOnlyFilter());
          stopWatch.start();
          table.get(get);
          stopWatch.stop();
        } else {
          scan = new Scan();
          scan.setCacheBlocks(false);
          scan.setFilter(new FirstKeyOnlyFilter());
          scan.setCaching(1);
          scan.setMaxResultSize(1L);
          scan.setOneRowLimit();
          stopWatch.start();
          ResultScanner s = table.getScanner(scan);
          s.next();
          s.close();
          stopWatch.stop();
        }
        successes.incrementAndGet();
        sink.publishReadTiming(tableName.getNameAsString(), serverName, stopWatch.getTime());
      } catch (TableNotFoundException tnfe) {
        // Not counted as a failure: the table disappearing says nothing about server health.
        LOG.error("Table may be deleted", tnfe);

      } catch (TableNotEnabledException tnee) {
        // The server answered, which is all this probe checks; treat disabled as success.
        successes.incrementAndGet();
        LOG.debug("The targeted table was disabled. Assuming success.");
      } catch (DoNotRetryIOException dnrioe) {
        sink.publishReadFailure(tableName.getNameAsString(), serverName);
        LOG.error(dnrioe.toString(), dnrioe);
      } catch (IOException e) {
        sink.publishReadFailure(tableName.getNameAsString(), serverName);
        LOG.error(e.toString(), e);
      } finally {
        if (table != null) {
          try {
            table.close();
          } catch (IOException e) {
            LOG.error("Close table failed", e);
          }
        }
        scan = null;
        get = null;
        startKey = null;
      }
      return null;
    }
  }
633
  // Exit codes returned by the tool; non-zero values distinguish the failure class.
  private static final int USAGE_EXIT_CODE = 1;
  private static final int INIT_ERROR_EXIT_CODE = 2;
  private static final int TIMEOUT_ERROR_EXIT_CODE = 3;
  private static final int ERROR_EXIT_CODE = 4;
  private static final int FAILURE_EXIT_CODE = 5;

  // Default sleep (ms) between runs in -daemon mode.
  private static final long DEFAULT_INTERVAL = 60000;

  // Default overall run timeout (ms) used when HBASE_CANARY_TIMEOUT is not set.
  private static final long DEFAULT_TIMEOUT = 600000;
  private static final int MAX_THREADS_NUM = 16;

  private static final Logger LOG = LoggerFactory.getLogger(Canary.class);

  // Table written to by write sniffing when no -writeTable is supplied.
  public static final TableName DEFAULT_WRITE_TABLE_NAME = TableName.valueOf(
    NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary");

  private static final String CANARY_TABLE_FAMILY_NAME = "Test";

  private Configuration conf = null;
  // 0 means run the monitor once; -daemon/-interval make this positive (see runMonitor loop).
  private long interval = 0;
  private Sink sink = null;

  // True when probing regionservers (-regionserver flag).
  private boolean regionServerMode = false;

  // True when probing the zookeeper ensemble (-zookeeper flag); exclusive with other modes.
  private boolean zookeeperMode = false;

  // Table name -> read timeout (ms), parsed from -readTableTimeouts /
  // HBASE_CANARY_REGION_READ_TABLE_TIMEOUT.
  private HashMap<String, Long> configuredReadTableTimeouts = new HashMap<>();

  // Configuration keys mirroring the command-line flags, so the tool can also be driven
  // purely through Configuration.
  public static final String HBASE_CANARY_REGIONSERVER_ALL_REGIONS
    = "hbase.canary.regionserver_all_regions";

  public static final String HBASE_CANARY_REGION_WRITE_SNIFFING
    = "hbase.canary.region.write.sniffing";
  public static final String HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT
    = "hbase.canary.region.write.table.timeout";
  public static final String HBASE_CANARY_REGION_WRITE_TABLE_NAME
    = "hbase.canary.region.write.table.name";
  public static final String HBASE_CANARY_REGION_READ_TABLE_TIMEOUT
    = "hbase.canary.region.read.table.timeout";

  public static final String HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES
    = "hbase.canary.zookeeper.permitted.failures";

  public static final String HBASE_CANARY_USE_REGEX = "hbase.canary.use.regex";
  public static final String HBASE_CANARY_TIMEOUT = "hbase.canary.timeout";
  public static final String HBASE_CANARY_FAIL_ON_ERROR = "hbase.canary.fail.on.error";

  // Executor that runs the per-region / per-server / per-znode probe tasks.
  private ExecutorService executor;
694
  /** Creates a canary with a single-threaded executor and a sink chosen later by mode. */
  public CanaryTool() {
    this(new ScheduledThreadPoolExecutor(1));
  }

  /** Creates a canary running its probe tasks on the given executor. */
  public CanaryTool(ExecutorService executor) {
    this(executor, null);
  }

  /** Visible for testing: allows injecting a specific sink. A null sink means "pick by mode". */
  @InterfaceAudience.Private
  CanaryTool(ExecutorService executor, Sink sink) {
    this.executor = executor;
    this.sink = sink;
  }

  CanaryTool(Configuration conf, ExecutorService executor) {
    this(conf, executor, null);
  }

  CanaryTool(Configuration conf, ExecutorService executor, Sink sink) {
    this(executor, sink);
    setConf(conf);
  }
717
  /** @return the configuration in use; may be null until {@link #setConf} is called. */
  @Override
  public Configuration getConf() {
    return conf;
  }
722
723 @Override
724 public void setConf(Configuration conf) {
725 if (conf == null) {
726 conf = HBaseConfiguration.create();
727 }
728 this.conf = conf;
729 }
730
  /**
   * Parses the command-line arguments, recording flag values into this tool's fields and the
   * Configuration, and validates mode/flag combinations. Any usage error prints help and
   * exits the JVM via {@link #printUsageAndExit()}.
   *
   * @param args raw command-line arguments
   * @return index of the first non-flag argument (the first monitor target), or -1 if none
   */
  private int parseArgs(String[] args) {
    int index = -1;
    long permittedFailures = 0;
    boolean regionServerAllRegions = false, writeSniffing = false;
    String readTableTimeoutsStr = null;

    // Process command-line flags; flags must precede the target list.
    for (int i = 0; i < args.length; i++) {
      String cmd = args[i];

      if (cmd.startsWith("-")) {
        if (index >= 0) {
          // Command line must be of the form: [opts] [target1 [target2 ...]]
          System.err.println("Invalid command line options");
          printUsageAndExit();
        }
        // Dispatch on the flag name; flags taking a value advance i past the value.
        if (cmd.equals("-help") || cmd.equals("-h")) {
          // User asked for help: print it and quit.
          printUsageAndExit();
        } else if (cmd.equals("-daemon") && interval == 0) {
          // Daemon mode with the default interval between checks.
          interval = DEFAULT_INTERVAL;
        } else if (cmd.equals("-interval")) {
          // Explicit interval, given in seconds; stored internally as milliseconds.
          i++;

          if (i == args.length) {
            System.err.println("-interval takes a numeric seconds value argument.");
            printUsageAndExit();
          }

          try {
            interval = Long.parseLong(args[i]) * 1000;
          } catch (NumberFormatException e) {
            System.err.println("-interval needs a numeric value argument.");
            printUsageAndExit();
          }
        } else if (cmd.equals("-zookeeper")) {
          this.zookeeperMode = true;
        } else if(cmd.equals("-regionserver")) {
          this.regionServerMode = true;
        } else if(cmd.equals("-allRegions")) {
          conf.setBoolean(HBASE_CANARY_REGIONSERVER_ALL_REGIONS, true);
          regionServerAllRegions = true;
        } else if(cmd.equals("-writeSniffing")) {
          writeSniffing = true;
          conf.setBoolean(HBASE_CANARY_REGION_WRITE_SNIFFING, true);
        } else if(cmd.equals("-treatFailureAsError") || cmd.equals("-failureAsError")) {
          conf.setBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
        } else if (cmd.equals("-e")) {
          conf.setBoolean(HBASE_CANARY_USE_REGEX, true);
        } else if (cmd.equals("-t")) {
          // Overall canary-run timeout in milliseconds.
          i++;

          if (i == args.length) {
            System.err.println("-t takes a numeric milliseconds value argument.");
            printUsageAndExit();
          }
          long timeout = 0;
          try {
            timeout = Long.parseLong(args[i]);
          } catch (NumberFormatException e) {
            System.err.println("-t takes a numeric milliseconds value argument.");
            printUsageAndExit();
          }
          conf.setLong(HBASE_CANARY_TIMEOUT, timeout);
        } else if(cmd.equals("-writeTableTimeout")) {
          // Timeout (ms) for the write-sniffing table.
          i++;

          if (i == args.length) {
            System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
            printUsageAndExit();
          }
          long configuredWriteTableTimeout = 0;
          try {
            configuredWriteTableTimeout = Long.parseLong(args[i]);
          } catch (NumberFormatException e) {
            System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
            printUsageAndExit();
          }
          conf.setLong(HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT, configuredWriteTableTimeout);
        } else if (cmd.equals("-writeTable")) {
          // Table used for write sniffing.
          i++;

          if (i == args.length) {
            System.err.println("-writeTable takes a string tablename value argument.");
            printUsageAndExit();
          }
          conf.set(HBASE_CANARY_REGION_WRITE_TABLE_NAME, args[i]);
        } else if (cmd.equals("-f")) {
          // Whether to exit on the first error.
          i++;

          if (i == args.length) {
            System.err
                .println("-f needs a boolean value argument (true|false).");
            printUsageAndExit();
          }

          conf.setBoolean(HBASE_CANARY_FAIL_ON_ERROR, Boolean.parseBoolean(args[i]));
        } else if (cmd.equals("-readTableTimeouts")) {
          // Comma-separated <table>=<ms> pairs; validated later in region mode.
          i++;

          if (i == args.length) {
            System.err.println("-readTableTimeouts needs a comma-separated list of read " +
                "millisecond timeouts per table (without spaces).");
            printUsageAndExit();
          }
          readTableTimeoutsStr = args[i];
          conf.set(HBASE_CANARY_REGION_READ_TABLE_TIMEOUT, readTableTimeoutsStr);
        } else if (cmd.equals("-permittedZookeeperFailures")) {
          // Number of initial zookeeper connection failures to tolerate.
          i++;

          if (i == args.length) {
            System.err.println("-permittedZookeeperFailures needs a numeric value argument.");
            printUsageAndExit();
          }
          try {
            permittedFailures = Long.parseLong(args[i]);
          } catch (NumberFormatException e) {
            System.err.println("-permittedZookeeperFailures needs a numeric value argument.");
            printUsageAndExit();
          }
          conf.setLong(HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES, permittedFailures);
        } else {
          // Unknown flag.
          System.err.println(cmd + " options is invalid.");
          printUsageAndExit();
        }
      } else if (index < 0) {
        // First non-flag argument marks the start of the target list.
        index = i;
      }
    }
    // Cross-flag validation: some combinations are contradictory.
    if (regionServerAllRegions && !this.regionServerMode) {
      System.err.println("-allRegions can only be specified in regionserver mode.");
      printUsageAndExit();
    }
    if (this.zookeeperMode) {
      if (this.regionServerMode || regionServerAllRegions || writeSniffing) {
        System.err.println("-zookeeper is exclusive and cannot be combined with "
            + "other modes.");
        printUsageAndExit();
      }
    }
    if (permittedFailures != 0 && !this.zookeeperMode) {
      System.err.println("-permittedZookeeperFailures requires -zookeeper mode.");
      printUsageAndExit();
    }
    if (readTableTimeoutsStr != null && (this.regionServerMode || this.zookeeperMode)) {
      System.err.println("-readTableTimeouts can only be configured in region mode.");
      printUsageAndExit();
    }
    return index;
  }
886
887 @Override
888 public int run(String[] args) throws Exception {
889 int index = parseArgs(args);
890 String[] monitorTargets = null;
891
892 if (index >= 0) {
893 int length = args.length - index;
894 monitorTargets = new String[length];
895 System.arraycopy(args, index, monitorTargets, 0, length);
896 }
897
898 if (zookeeperMode) {
899 return checkZooKeeper();
900 } else if (regionServerMode) {
901 return checkRegionServers(monitorTargets);
902 } else {
903 return checkRegions(monitorTargets);
904 }
905 }
906
907 private int runMonitor(String[] monitorTargets) throws Exception {
908 ChoreService choreService = null;
909
910
911
912
913 final ScheduledChore authChore = AuthUtil.getAuthChore(conf);
914 if (authChore != null) {
915 choreService = new ChoreService("CANARY_TOOL");
916 choreService.scheduleChore(authChore);
917 }
918
919
920 Monitor monitor = null;
921 Thread monitorThread;
922 long startTime = 0;
923 long currentTimeLength = 0;
924 boolean failOnError = conf.getBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
925 long timeout = conf.getLong(HBASE_CANARY_TIMEOUT, DEFAULT_TIMEOUT);
926
927 try (Connection connection = ConnectionFactory.createConnection(this.conf)) {
928 do {
929
930 try {
931 monitor = this.newMonitor(connection, monitorTargets);
932 monitorThread = new Thread(monitor, "CanaryMonitor-" + System.currentTimeMillis());
933 startTime = System.currentTimeMillis();
934 monitorThread.start();
935 while (!monitor.isDone()) {
936
937 Thread.sleep(1000);
938
939 if (failOnError && monitor.hasError()) {
940 monitorThread.interrupt();
941 if (monitor.initialized) {
942 return monitor.errorCode;
943 } else {
944 return INIT_ERROR_EXIT_CODE;
945 }
946 }
947 currentTimeLength = System.currentTimeMillis() - startTime;
948 if (currentTimeLength > timeout) {
949 LOG.error("The monitor is running too long (" + currentTimeLength
950 + ") after timeout limit:" + timeout
951 + " will be killed itself !!");
952 if (monitor.initialized) {
953 return TIMEOUT_ERROR_EXIT_CODE;
954 } else {
955 return INIT_ERROR_EXIT_CODE;
956 }
957 }
958 }
959
960 if (failOnError && monitor.finalCheckForErrors()) {
961 monitorThread.interrupt();
962 return monitor.errorCode;
963 }
964 } finally {
965 if (monitor != null) monitor.close();
966 }
967
968 Thread.sleep(interval);
969 } while (interval > 0);
970 }
971
972 if (choreService != null) {
973 choreService.shutdown();
974 }
975 return monitor.errorCode;
976 }
977
  /** @return region name -> server name for every failed read recorded by the sink. */
  @Override
  public Map<String, String> getReadFailures() {
    return sink.getReadFailures();
  }

  /** @return region name -> server name for every failed write recorded by the sink. */
  @Override
  public Map<String, String> getWriteFailures() {
    return sink.getWriteFailures();
  }
987
  /**
   * Prints command-line usage to stderr and terminates the JVM with
   * {@code USAGE_EXIT_CODE}. Never returns.
   */
  private void printUsageAndExit() {
    System.err.println(
      "Usage: canary [OPTIONS] [<TABLE1> [<TABLE2]...] | [<REGIONSERVER1> [<REGIONSERVER2]..]");
    System.err.println("Where [OPTIONS] are:");
    System.err.println(" -h,-help show this help and exit.");
    System.err.println(" -regionserver set 'regionserver mode'; gets row from random region on " +
        "server");
    System.err.println(" -allRegions get from ALL regions when 'regionserver mode', not just " +
        "random one.");
    System.err.println(" -zookeeper set 'zookeeper mode'; grab zookeeper.znode.parent on " +
        "each ensemble member");
    System.err.println(" -daemon continuous check at defined intervals.");
    System.err.println(" -interval <N> interval between checks in seconds");
    System.err.println(" -e consider table/regionserver argument as regular " +
        "expression");
    System.err.println(" -f <B> exit on first error; default=true");
    System.err.println(" -failureAsError treat read/write failure as error");
    System.err.println(" -t <N> timeout for canary-test run; default=600000ms");
    System.err.println(" -writeSniffing enable write sniffing");
    System.err.println(" -writeTable the table used for write sniffing; default=hbase:canary");
    System.err.println(" -writeTableTimeout <N> timeout for writeTable; default=600000ms");
    System.err.println(" -readTableTimeouts <tableName>=<read timeout>," +
        "<tableName>=<read timeout>,...");
    System.err.println(" comma-separated list of table read timeouts " +
        "(no spaces);");
    System.err.println(" logs 'ERROR' if takes longer. default=600000ms");
    System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures attempting to ");
    System.err.println(" connect to individual zookeeper nodes in ensemble");
    System.err.println("");
    System.err.println(" -D<configProperty>=<value> to assign or override configuration params");
    System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Set to enable/disable " +
        "raw scan; default=false");
    System.err.println("");
    System.err.println("Canary runs in one of three modes: region (default), regionserver, or " +
        "zookeeper.");
    System.err.println("To sniff/probe all regions, pass no arguments.");
    System.err.println("To sniff/probe all regions of a table, pass tablename.");
    System.err.println("To sniff/probe regionservers, pass -regionserver, etc.");
    System.err.println("See http://hbase.apache.org/book.html#_canary for Canary documentation.");
    System.exit(USAGE_EXIT_CODE);
  }
1029
1030 Sink getSink(Configuration configuration, Class clazz) {
1031
1032 return this.sink != null? this.sink:
1033 (Sink)ReflectionUtils.newInstance(configuration.getClass("hbase.canary.sink.class",
1034 clazz, Sink.class));
1035 }
1036
1037
1038
1039
1040
1041 public static class RegionTaskResult {
1042 private HRegionInfo region;
1043 private TableName tableName;
1044 private ServerName serverName;
1045 private HColumnDescriptor column;
1046 private AtomicLong readLatency = null;
1047 private AtomicLong writeLatency = null;
1048 private boolean readSuccess = false;
1049 private boolean writeSuccess = false;
1050
1051 public RegionTaskResult(HRegionInfo region, TableName tableName, ServerName serverName, HColumnDescriptor column) {
1052 this.region = region;
1053 this.tableName = tableName;
1054 this.serverName = serverName;
1055 this.column = column;
1056 }
1057
1058 public HRegionInfo getRegionInfo() {
1059 return this.region;
1060 }
1061
1062 public String getRegionNameAsString() {
1063 return this.region.getRegionNameAsString();
1064 }
1065
1066 public TableName getTableName() {
1067 return this.tableName;
1068 }
1069
1070 public String getTableNameAsString() {
1071 return this.tableName.getNameAsString();
1072 }
1073
1074 public ServerName getServerName() {
1075 return this.serverName;
1076 }
1077
1078 public String getServerNameAsString() {
1079 return this.serverName.getServerName();
1080 }
1081
1082 public HColumnDescriptor getColumnFamily() {
1083 return this.column;
1084 }
1085
1086 public String getColumnFamilyNameAsString() {
1087 return this.column.getNameAsString();
1088 }
1089
1090 public long getReadLatency() {
1091 if (this.readLatency == null) {
1092 return -1;
1093 }
1094 return this.readLatency.get();
1095 }
1096
1097 public void setReadLatency(long readLatency) {
1098 if (this.readLatency != null) {
1099 this.readLatency.set(readLatency);
1100 } else {
1101 this.readLatency = new AtomicLong(readLatency);
1102 }
1103 }
1104
1105 public long getWriteLatency() {
1106 if (this.writeLatency == null) {
1107 return -1;
1108 }
1109 return this.writeLatency.get();
1110 }
1111
1112 public void setWriteLatency(long writeLatency) {
1113 if (this.writeLatency != null) {
1114 this.writeLatency.set(writeLatency);
1115 } else {
1116 this.writeLatency = new AtomicLong(writeLatency);
1117 }
1118 }
1119
1120 public boolean isReadSuccess() {
1121 return this.readSuccess;
1122 }
1123
1124 public void setReadSuccess() {
1125 this.readSuccess = true;
1126 }
1127
1128 public boolean isWriteSuccess() {
1129 return this.writeSuccess;
1130 }
1131
1132 public void setWriteSuccess() {
1133 this.writeSuccess = true;
1134 }
1135 }
1136
1137
1138
1139
1140
1141
1142 private Monitor newMonitor(final Connection connection, String[] monitorTargets) {
1143 Monitor monitor;
1144 boolean useRegExp = conf.getBoolean(HBASE_CANARY_USE_REGEX, false);
1145 boolean regionServerAllRegions
1146 = conf.getBoolean(HBASE_CANARY_REGIONSERVER_ALL_REGIONS, false);
1147 boolean failOnError
1148 = conf.getBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
1149 int permittedFailures
1150 = conf.getInt(HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES, 0);
1151 boolean writeSniffing
1152 = conf.getBoolean(HBASE_CANARY_REGION_WRITE_SNIFFING, false);
1153 String writeTableName = conf.get(HBASE_CANARY_REGION_WRITE_TABLE_NAME,
1154 DEFAULT_WRITE_TABLE_NAME.getNameAsString());
1155 long configuredWriteTableTimeout
1156 = conf.getLong(HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT, DEFAULT_TIMEOUT);
1157
1158 if (this.regionServerMode) {
1159 monitor =
1160 new RegionServerMonitor(connection, monitorTargets, useRegExp,
1161 getSink(connection.getConfiguration(), RegionServerStdOutSink.class),
1162 this.executor, regionServerAllRegions,
1163 failOnError, permittedFailures);
1164
1165 } else if (this.zookeeperMode) {
1166 monitor =
1167 new ZookeeperMonitor(connection, monitorTargets, useRegExp,
1168 getSink(connection.getConfiguration(), ZookeeperStdOutSink.class),
1169 this.executor, failOnError, permittedFailures);
1170 } else {
1171 monitor =
1172 new RegionMonitor(connection, monitorTargets, useRegExp,
1173 getSink(connection.getConfiguration(), RegionStdOutSink.class),
1174 this.executor, writeSniffing,
1175 TableName.valueOf(writeTableName), failOnError, configuredReadTableTimeouts,
1176 configuredWriteTableTimeout, permittedFailures);
1177 }
1178 return monitor;
1179 }
1180
1181 private void populateReadTableTimeoutsMap(String configuredReadTableTimeoutsStr) {
1182 String[] tableTimeouts = configuredReadTableTimeoutsStr.split(",");
1183 for (String tT : tableTimeouts) {
1184 String[] nameTimeout = tT.split("=");
1185 if (nameTimeout.length < 2) {
1186 throw new IllegalArgumentException("Each -readTableTimeouts argument must be of the form " +
1187 "<tableName>=<read timeout> (without spaces).");
1188 }
1189 long timeoutVal;
1190 try {
1191 timeoutVal = Long.parseLong(nameTimeout[1]);
1192 } catch (NumberFormatException e) {
1193 throw new IllegalArgumentException("-readTableTimeouts read timeout for each table" +
1194 " must be a numeric value argument.");
1195 }
1196 configuredReadTableTimeouts.put(nameTimeout[0], timeoutVal);
1197 }
1198 }
1199
1200
1201
1202 public static abstract class Monitor implements Runnable, Closeable {
1203 protected Connection connection;
1204 protected Admin admin;
1205
1206
1207
1208
1209 protected String[] targets;
1210 protected boolean useRegExp;
1211 protected boolean treatFailureAsError;
1212 protected boolean initialized = false;
1213
1214 protected boolean done = false;
1215 protected int errorCode = 0;
1216 protected long allowedFailures = 0;
1217 protected Sink sink;
1218 protected ExecutorService executor;
1219
1220 public boolean isDone() {
1221 return done;
1222 }
1223
1224 public boolean hasError() {
1225 return errorCode != 0;
1226 }
1227
1228 public boolean finalCheckForErrors() {
1229 if (errorCode != 0) {
1230 return true;
1231 }
1232 if (treatFailureAsError &&
1233 (sink.getReadFailureCount() > allowedFailures || sink.getWriteFailureCount() > allowedFailures)) {
1234 LOG.error("Too many failures detected, treating failure as error, failing the Canary.");
1235 errorCode = FAILURE_EXIT_CODE;
1236 return true;
1237 }
1238 return false;
1239 }
1240
1241 @Override
1242 public void close() throws IOException {
1243 if (this.admin != null) this.admin.close();
1244 }
1245
1246 protected Monitor(Connection connection, String[] monitorTargets, boolean useRegExp, Sink sink,
1247 ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
1248 if (null == connection) throw new IllegalArgumentException("connection shall not be null");
1249
1250 this.connection = connection;
1251 this.targets = monitorTargets;
1252 this.useRegExp = useRegExp;
1253 this.treatFailureAsError = treatFailureAsError;
1254 this.sink = sink;
1255 this.executor = executor;
1256 this.allowedFailures = allowedFailures;
1257 }
1258
1259 @Override
1260 public abstract void run();
1261
1262 protected boolean initAdmin() {
1263 if (null == this.admin) {
1264 try {
1265 this.admin = this.connection.getAdmin();
1266 } catch (Exception e) {
1267 LOG.error("Initial HBaseAdmin failed...", e);
1268 this.errorCode = INIT_ERROR_EXIT_CODE;
1269 }
1270 } else if (admin.isAborted()) {
1271 LOG.error("HBaseAdmin aborted");
1272 this.errorCode = INIT_ERROR_EXIT_CODE;
1273 }
1274 return !this.hasError();
1275 }
1276 }
1277
1278
1279
1280
/**
 * Default monitoring mode: probes regions by reading from every table (or the targeted
 * tables), optionally also sniffing writes against a dedicated canary write table, and
 * compares observed per-table latencies against configured timeouts.
 */
private static class RegionMonitor extends Monitor {
  // Default interval between write-table distribution checks: 10 minutes (ms).
  private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
  // Default TTL for canary write data: 1 day (seconds).
  private static final int DEFAULT_WRITE_DATA_TTL = 24 * 60 * 60;

  // Time of the last write-table distribution check; -1 forces a check on first run.
  private long lastCheckTime = -1;
  private boolean writeSniffing;
  private TableName writeTableName;
  private int writeDataTTL;
  // Desired regions-per-server bounds for the canary write table.
  private float regionsLowerLimit;
  private float regionsUpperLimit;
  private int checkPeriod;
  private boolean rawScanEnabled;

  // Per-table configured read timeouts (ms), keyed by table name.
  private HashMap<String, Long> configuredReadTableTimeouts;

  private long configuredWriteTableTimeout;

  public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
      Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
      boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts,
      long configuredWriteTableTimeout,
      long allowedFailures) {
    super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
        allowedFailures);
    Configuration conf = connection.getConfiguration();
    this.writeSniffing = writeSniffing;
    this.writeTableName = writeTableName;
    this.writeDataTTL =
        conf.getInt(HConstants.HBASE_CANARY_WRITE_DATA_TTL_KEY, DEFAULT_WRITE_DATA_TTL);
    this.regionsLowerLimit =
        conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY, 1.0f);
    this.regionsUpperLimit =
        conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY, 1.5f);
    this.checkPeriod =
        conf.getInt(HConstants.HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY,
            DEFAULT_WRITE_TABLE_CHECK_PERIOD);
    this.rawScanEnabled = conf.getBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, false);
    // Defensive copy: callers keep their own map.
    this.configuredReadTableTimeouts = new HashMap<>(configuredReadTableTimeouts);
    this.configuredWriteTableTimeout = configuredWriteTableTimeout;
  }

  /** Narrows the generic sink to the region sink this monitor requires. */
  private RegionStdOutSink getSink() {
    if (!(sink instanceof RegionStdOutSink)) {
      throw new RuntimeException("Can only write to Region sink");
    }
    return ((RegionStdOutSink) sink);
  }

  @Override
  public void run() {
    if (this.initAdmin()) {
      try {
        List<Future<Void>> taskFutures = new LinkedList<>();
        RegionStdOutSink regionSink = this.getSink();
        if (this.targets != null && this.targets.length > 0) {
          String[] tables = generateMonitorTables(this.targets);
          // Every table with a configured read timeout must be among the monitor targets.
          if (!new HashSet<>(Arrays.asList(tables)).
              containsAll(this.configuredReadTableTimeouts.keySet())) {
            LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets " +
                "passed via command line.");
            this.errorCode = USAGE_EXIT_CODE;
            return;
          }
          this.initialized = true;
          for (String table : tables) {
            AtomicLong readLatency = regionSink.initializeAndGetReadLatencyForTable(table);
            taskFutures.addAll(CanaryTool.sniff(admin, regionSink, table, executor, TaskType.READ,
                this.rawScanEnabled, readLatency));
          }
        } else {
          // No explicit targets: read-sniff every enabled table except the write table.
          taskFutures.addAll(sniff(TaskType.READ, regionSink));
        }

        if (writeSniffing) {
          // Periodically verify the canary write table exists and is spread across servers.
          if (EnvironmentEdgeManager.currentTime() - lastCheckTime > checkPeriod) {
            try {
              checkWriteTableDistribution();
            } catch (IOException e) {
              LOG.error("Check canary table distribution failed!", e);
            }
            lastCheckTime = EnvironmentEdgeManager.currentTime();
          }
          // Schedule write probes against every region of the write table.
          regionSink.initializeWriteLatency();
          AtomicLong writeTableLatency = regionSink.getWriteLatency();
          taskFutures.addAll(CanaryTool.sniff(admin, regionSink, admin.getTableDescriptor(writeTableName),
              executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency));
        }

        // Wait for all probe tasks; individual failures are logged, not fatal here.
        for (Future<Void> future : taskFutures) {
          try {
            future.get();
          } catch (ExecutionException e) {
            LOG.error("Sniff region failed!", e);
          }
        }
        // Compare observed read latencies against the configured per-table timeouts.
        Map<String, AtomicLong> actualReadTableLatency = regionSink.getReadLatencyMap();
        for (Map.Entry<String, Long> entry : configuredReadTableTimeouts.entrySet()) {
          String tableName = entry.getKey();
          if (actualReadTableLatency.containsKey(tableName)) {
            Long actual = actualReadTableLatency.get(tableName).longValue();
            Long configured = entry.getValue();
            if (actual > configured) {
              LOG.error("Read operation for {} took {}ms exceeded the configured read timeout." +
                  "(Configured read timeout {}ms.", tableName, actual, configured);
            } else {
              LOG.info("Read operation for {} took {}ms (Configured read timeout {}ms.",
                  tableName, actual, configured);
            }
          } else {
            LOG.error("Read operation for {} failed!", tableName);
          }
        }
        // Same check for the write latency, if write sniffing ran.
        if (this.writeSniffing) {
          String writeTableStringName = this.writeTableName.getNameAsString();
          long actualWriteLatency = regionSink.getWriteLatency().longValue();
          LOG.info("Write operation for {} took {}ms. Configured write timeout {}ms.",
              writeTableStringName, actualWriteLatency, this.configuredWriteTableTimeout);
          if (actualWriteLatency > this.configuredWriteTableTimeout) {
            LOG.error("Write operation for {} exceeded the configured write timeout.",
                writeTableStringName);
          }
        }
      } catch (Exception e) {
        LOG.error("Run regionMonitor failed", e);
        this.errorCode = ERROR_EXIT_CODE;
      } finally {
        this.done = true;
      }
    }
    this.done = true;
  }

  /**
   * Resolves the monitor targets to concrete table names. With -e, each target is treated as a
   * regex matched against all table names; otherwise targets are returned as-is.
   *
   * @throws IOException if listing tables fails
   * @throws TableNotFoundException if regex targets match no table
   */
  private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
    String[] returnTables = null;

    if (this.useRegExp) {
      Pattern pattern = null;
      HTableDescriptor[] tds = null;
      Set<String> tmpTables = new TreeSet<>();
      try {
        LOG.debug(String.format("reading list of tables"));
        // First call passes a null pattern, i.e. lists all tables.
        tds = this.admin.listTables(pattern);
        if (tds == null) {
          tds = new HTableDescriptor[] {};
        }
        for (String monitorTarget : monitorTargets) {
          pattern = Pattern.compile(monitorTarget);
          for (HTableDescriptor td : tds) {
            if (pattern.matcher(td.getTableName().getNameAsString()).matches()) {
              tmpTables.add(td.getTableName().getNameAsString());
            }
          }
        }
      } catch (IOException e) {
        LOG.error("Communicate with admin failed", e);
        throw e;
      }

      if (tmpTables.size() > 0) {
        returnTables = tmpTables.toArray(new String[tmpTables.size()]);
      } else {
        String msg = "No HTable found, tablePattern:" + Arrays.toString(monitorTargets);
        LOG.error(msg);
        this.errorCode = INIT_ERROR_EXIT_CODE;
        throw new TableNotFoundException(msg);
      }
    } else {
      returnTables = monitorTargets;
    }

    return returnTables;
  }

  /**
   * Schedules probe tasks for every enabled table except the canary write table.
   */
  private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink)
      throws Exception {
    LOG.debug("Reading list of tables");
    List<Future<Void>> taskFutures = new LinkedList<>();
    for (HTableDescriptor td: admin.listTables()) {
      if (admin.tableExists(td.getTableName()) && admin.isTableEnabled(td.getTableName()) &&
          (!td.getTableName().equals(writeTableName))) {
        AtomicLong readLatency =
            regionSink.initializeAndGetReadLatencyForTable(td.getTableName().getNameAsString());
        taskFutures.addAll(CanaryTool.sniff(admin, sink, td, executor, taskType, this.rawScanEnabled,
            readLatency));
      }
    }
    return taskFutures;
  }

  /**
   * Ensures the canary write table exists, is enabled, and its regions are spread across the
   * live regionservers within the configured per-server bounds; recreates the table or runs
   * the balancer when it is not.
   */
  private void checkWriteTableDistribution() throws IOException {
    if (!admin.tableExists(writeTableName)) {
      int numberOfServers =
          admin.getClusterStatus().getServersSize();
      if (numberOfServers == 0) {
        throw new IllegalStateException("No live regionservers");
      }
      createWriteTable(numberOfServers);
    }

    if (!admin.isTableEnabled(writeTableName)) {
      admin.enableTable(writeTableName);
    }

    ClusterStatus status =
        admin.getClusterStatus();
    int numberOfServers = status.getServersSize();
    // Do not count a master that also reports as a server.
    if (status.getServers().contains(status.getMaster())) {
      numberOfServers -= 1;
    }

    List<HRegionLocation> locations;
    RegionLocator locator = connection.getRegionLocator(writeTableName);
    try {
      locations = locator.getAllRegionLocations();
    } finally {
      locator.close();
    }
    int numberOfRegions = locations.size();
    // Region count out of the configured per-server band: rebuild the write table.
    if (numberOfRegions < numberOfServers * regionsLowerLimit
        || numberOfRegions > numberOfServers * regionsUpperLimit) {
      admin.disableTable(writeTableName);
      admin.deleteTable(writeTableName);
      createWriteTable(numberOfServers);
    }
    HashSet<ServerName> serverSet = new HashSet<ServerName>();
    for (HRegionLocation location: locations) {
      serverSet.add(location.getServerName());
    }
    int numberOfCoveredServers = serverSet.size();
    // Some servers host no write-table region: ask the balancer to spread them.
    if (numberOfCoveredServers < numberOfServers) {
      admin.balancer();
    }
  }

  /**
   * Creates the canary write table, pre-split so each live regionserver gets roughly
   * regionsLowerLimit regions, with a single-version, TTL-bounded column family.
   */
  private void createWriteTable(int numberOfServers) throws IOException {
    int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
    LOG.info("Number of live regionservers {}, pre-splitting the canary table into {} regions " +
        "(current lower limit of regions per server is {} and you can change it with config {}).",
        numberOfServers, numberOfRegions, regionsLowerLimit,
        HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY);
    HTableDescriptor desc = new HTableDescriptor(writeTableName);
    HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
    family.setMaxVersions(1);
    family.setTimeToLive(writeDataTTL);

    desc.addFamily(family);
    byte[][] splits = new RegionSplitter.HexStringSplit().split(numberOfRegions);
    admin.createTable(desc, splits);
  }
}
1547
1548
1549
1550
1551
1552 private static List<Future<Void>> sniff(final Admin admin, final Sink sink, String tableName,
1553 ExecutorService executor, TaskType taskType, boolean rawScanEnabled, AtomicLong readLatency)
1554 throws Exception {
1555 LOG.debug("Checking table is enabled and getting table descriptor for table {}", tableName);
1556 if (admin.isTableEnabled(TableName.valueOf(tableName))) {
1557 return CanaryTool.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)),
1558 executor, taskType, rawScanEnabled, readLatency);
1559 } else {
1560 LOG.warn("Table {} is not enabled", tableName);
1561 }
1562 return new LinkedList<>();
1563 }
1564
1565
1566
1567
1568 private static List<Future<Void>> sniff(final Admin admin, final Sink sink,
1569 HTableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
1570 boolean rawScanEnabled, AtomicLong rwLatency) throws Exception {
1571 LOG.debug("Reading list of regions for table {}", tableDesc.getTableName());
1572 try (Table table = admin.getConnection().getTable(tableDesc.getTableName())) {
1573 List<RegionTask> tasks = new ArrayList<>();
1574 try (RegionLocator regionLocator =
1575 admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
1576 for (HRegionLocation location: regionLocator.getAllRegionLocations()) {
1577 ServerName rs = location.getServerName();
1578 HRegionInfo region = location.getRegionInfo();
1579 tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink)sink,
1580 taskType, rawScanEnabled, rwLatency));
1581 Map<String, List<RegionTaskResult>> regionMap = ((RegionStdOutSink) sink).getRegionMap();
1582 regionMap.put(region.getRegionNameAsString(), new ArrayList<RegionTaskResult>());
1583 }
1584 return executor.invokeAll(tasks);
1585 }
1586 } catch (TableNotFoundException e) {
1587 return Collections.EMPTY_LIST;
1588 }
1589 }
1590
1591
1592 private static class ZookeeperMonitor extends Monitor {
1593 private List<String> hosts;
1594 private final String znode;
1595 private final int timeout;
1596
1597 protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
1598 Sink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
1599 super(connection, monitorTargets, useRegExp,
1600 sink, executor, treatFailureAsError, allowedFailures);
1601 Configuration configuration = connection.getConfiguration();
1602 znode =
1603 configuration.get(ZOOKEEPER_ZNODE_PARENT,
1604 DEFAULT_ZOOKEEPER_ZNODE_PARENT);
1605 timeout = configuration
1606 .getInt(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT);
1607 ConnectStringParser parser =
1608 new ConnectStringParser(ZKConfig.getZKQuorumServersString(configuration));
1609 hosts = Lists.newArrayList();
1610 for (InetSocketAddress server : parser.getServerAddresses()) {
1611 hosts.add(server.toString());
1612 }
1613 if (allowedFailures > (hosts.size() - 1) / 2) {
1614 LOG.warn("Confirm allowable number of failed ZooKeeper nodes, as quorum will " +
1615 "already be lost. Setting of {} failures is unexpected for {} ensemble size.",
1616 allowedFailures, hosts.size());
1617 }
1618 }
1619
1620 @Override public void run() {
1621 List<ZookeeperTask> tasks = Lists.newArrayList();
1622 ZookeeperStdOutSink zkSink = null;
1623 try {
1624 zkSink = this.getSink();
1625 } catch (RuntimeException e) {
1626 LOG.error("Run ZooKeeperMonitor failed!", e);
1627 this.errorCode = ERROR_EXIT_CODE;
1628 }
1629 this.initialized = true;
1630 for (final String host : hosts) {
1631 tasks.add(new ZookeeperTask(connection, host, znode, timeout, zkSink));
1632 }
1633 try {
1634 for (Future<Void> future : this.executor.invokeAll(tasks)) {
1635 try {
1636 future.get();
1637 } catch (ExecutionException e) {
1638 LOG.error("Sniff zookeeper failed!", e);
1639 this.errorCode = ERROR_EXIT_CODE;
1640 }
1641 }
1642 } catch (InterruptedException e) {
1643 this.errorCode = ERROR_EXIT_CODE;
1644 Thread.currentThread().interrupt();
1645 LOG.error("Sniff zookeeper interrupted!", e);
1646 }
1647 this.done = true;
1648 }
1649
1650 private ZookeeperStdOutSink getSink() {
1651 if (!(sink instanceof ZookeeperStdOutSink)) {
1652 throw new RuntimeException("Can only write to zookeeper sink");
1653 }
1654 return ((ZookeeperStdOutSink) sink);
1655 }
1656 }
1657
1658
1659
1660
1661
1662 private static class RegionServerMonitor extends Monitor {
1663 private boolean allRegions;
1664
1665 public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
1666 Sink sink, ExecutorService executor, boolean allRegions,
1667 boolean treatFailureAsError, long allowedFailures) {
1668 super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
1669 allowedFailures);
1670 this.allRegions = allRegions;
1671 }
1672
1673 private RegionServerStdOutSink getSink() {
1674 if (!(sink instanceof RegionServerStdOutSink)) {
1675 throw new RuntimeException("Can only write to regionserver sink");
1676 }
1677 return ((RegionServerStdOutSink) sink);
1678 }
1679
1680 @Override
1681 public void run() {
1682 if (this.initAdmin() && this.checkNoTableNames()) {
1683 RegionServerStdOutSink regionServerSink = null;
1684 try {
1685 regionServerSink = this.getSink();
1686 } catch (RuntimeException e) {
1687 LOG.error("Run RegionServerMonitor failed!", e);
1688 this.errorCode = ERROR_EXIT_CODE;
1689 }
1690 Map<String, List<HRegionInfo>> rsAndRMap = this.filterRegionServerByName();
1691 this.initialized = true;
1692 this.monitorRegionServers(rsAndRMap, regionServerSink);
1693 }
1694 this.done = true;
1695 }
1696
1697 private boolean checkNoTableNames() {
1698 List<String> foundTableNames = new ArrayList<>();
1699 TableName[] tableNames = null;
1700 LOG.debug("Reading list of tables");
1701 try {
1702 tableNames = this.admin.listTableNames();
1703 } catch (IOException e) {
1704 LOG.error("Get listTableNames failed", e);
1705 this.errorCode = INIT_ERROR_EXIT_CODE;
1706 return false;
1707 }
1708
1709 if (this.targets == null || this.targets.length == 0) return true;
1710
1711 for (String target : this.targets) {
1712 for (TableName tableName : tableNames) {
1713 if (target.equals(tableName.getNameAsString())) {
1714 foundTableNames.add(target);
1715 }
1716 }
1717 }
1718
1719 if (foundTableNames.size() > 0) {
1720 System.err.println("Cannot pass a tablename when using the -regionserver " +
1721 "option, tablenames:" + foundTableNames.toString());
1722 this.errorCode = USAGE_EXIT_CODE;
1723 }
1724 return foundTableNames.isEmpty();
1725 }
1726
1727 private void monitorRegionServers(Map<String, List<HRegionInfo>> rsAndRMap, RegionServerStdOutSink regionServerSink) {
1728 List<RegionServerTask> tasks = new ArrayList<>();
1729 Map<String, AtomicLong> successMap = new HashMap<>();
1730 Random rand = new Random();
1731 for (Map.Entry<String, List<HRegionInfo>> entry : rsAndRMap.entrySet()) {
1732 String serverName = entry.getKey();
1733 AtomicLong successes = new AtomicLong(0);
1734 successMap.put(serverName, successes);
1735 if (entry.getValue().isEmpty()) {
1736 LOG.error("Regionserver not serving any regions - {}", serverName);
1737 } else if (this.allRegions) {
1738 for (HRegionInfo region : entry.getValue()) {
1739 tasks.add(new RegionServerTask(this.connection,
1740 serverName,
1741 region,
1742 regionServerSink,
1743 successes));
1744 }
1745 } else {
1746
1747 HRegionInfo region = entry.getValue().get(rand.nextInt(entry.getValue().size()));
1748 tasks.add(new RegionServerTask(this.connection,
1749 serverName,
1750 region,
1751 regionServerSink,
1752 successes));
1753 }
1754 }
1755 try {
1756 for (Future<Void> future : this.executor.invokeAll(tasks)) {
1757 try {
1758 future.get();
1759 } catch (ExecutionException e) {
1760 LOG.error("Sniff regionserver failed!", e);
1761 this.errorCode = ERROR_EXIT_CODE;
1762 }
1763 }
1764 if (this.allRegions) {
1765 for (Map.Entry<String, List<HRegionInfo>> entry : rsAndRMap.entrySet()) {
1766 String serverName = entry.getKey();
1767 LOG.info("Successfully read {} regions out of {} on regionserver {}",
1768 successMap.get(serverName), entry.getValue().size(), serverName);
1769 }
1770 }
1771 } catch (InterruptedException e) {
1772 this.errorCode = ERROR_EXIT_CODE;
1773 LOG.error("Sniff regionserver interrupted!", e);
1774 }
1775 }
1776
1777 private Map<String, List<HRegionInfo>> filterRegionServerByName() {
1778 Map<String, List<HRegionInfo>> regionServerAndRegionsMap = this.getAllRegionServerByName();
1779 regionServerAndRegionsMap = this.doFilterRegionServerByName(regionServerAndRegionsMap);
1780 return regionServerAndRegionsMap;
1781 }
1782
1783 private Map<String, List<HRegionInfo>> getAllRegionServerByName() {
1784 Map<String, List<HRegionInfo>> rsAndRMap = new HashMap<>();
1785 try {
1786 LOG.debug("Reading list of tables and locations");
1787 HTableDescriptor[] tableDescs = this.admin.listTables();
1788 List<HRegionInfo> regions = null;
1789 for (HTableDescriptor tableDesc: tableDescs) {
1790 try (RegionLocator regionLocator =
1791 this.admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
1792 for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
1793 ServerName rs = location.getServerName();
1794 String rsName = rs.getHostname();
1795 HRegionInfo r = location.getRegionInfo();
1796 if (rsAndRMap.containsKey(rsName)) {
1797 regions = rsAndRMap.get(rsName);
1798 } else {
1799 regions = new ArrayList<>();
1800 rsAndRMap.put(rsName, regions);
1801 }
1802 regions.add(r);
1803 }
1804 }
1805 }
1806
1807
1808 for (ServerName rs: this.admin.getClusterStatus()
1809 .getServers()) {
1810 String rsName = rs.getHostname();
1811 if (!rsAndRMap.containsKey(rsName)) {
1812 rsAndRMap.put(rsName, Collections.<HRegionInfo> emptyList());
1813 }
1814 }
1815 } catch (IOException e) {
1816 LOG.error("Get HTables info failed", e);
1817 this.errorCode = INIT_ERROR_EXIT_CODE;
1818 }
1819 return rsAndRMap;
1820 }
1821
1822 private Map<String, List<HRegionInfo>> doFilterRegionServerByName(
1823 Map<String, List<HRegionInfo>> fullRsAndRMap) {
1824
1825 Map<String, List<HRegionInfo>> filteredRsAndRMap = null;
1826
1827 if (this.targets != null && this.targets.length > 0) {
1828 filteredRsAndRMap = new HashMap<>();
1829 Pattern pattern = null;
1830 Matcher matcher = null;
1831 boolean regExpFound = false;
1832 for (String rsName : this.targets) {
1833 if (this.useRegExp) {
1834 regExpFound = false;
1835 pattern = Pattern.compile(rsName);
1836 for (Map.Entry<String, List<HRegionInfo>> entry : fullRsAndRMap.entrySet()) {
1837 matcher = pattern.matcher(entry.getKey());
1838 if (matcher.matches()) {
1839 filteredRsAndRMap.put(entry.getKey(), entry.getValue());
1840 regExpFound = true;
1841 }
1842 }
1843 if (!regExpFound) {
1844 LOG.info("No RegionServerInfo found, regionServerPattern {}", rsName);
1845 }
1846 } else {
1847 if (fullRsAndRMap.containsKey(rsName)) {
1848 filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
1849 } else {
1850 LOG.info("No RegionServerInfo found, regionServerName {}", rsName);
1851 }
1852 }
1853 }
1854 } else {
1855 filteredRsAndRMap = fullRsAndRMap;
1856 }
1857 return filteredRsAndRMap;
1858 }
1859 }
1860
1861 public static void main(String[] args) throws Exception {
1862 final Configuration conf = HBaseConfiguration.create();
1863
1864 int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
1865 LOG.info("Execution thread count={}", numThreads);
1866
1867 int exitCode;
1868 ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
1869 try {
1870 exitCode = ToolRunner.run(conf, new CanaryTool(executor), args);
1871 } finally {
1872 executor.shutdown();
1873 }
1874 System.exit(exitCode);
1875 }
1876 }