1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.replication.regionserver;
20
21 import com.google.common.cache.Cache;
22 import com.google.common.cache.CacheBuilder;
23 import com.google.common.collect.Lists;
24 import com.google.protobuf.ServiceException;
25
26 import java.io.IOException;
27 import java.io.InterruptedIOException;
28 import java.util.ArrayList;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.concurrent.Callable;
32 import java.util.concurrent.ExecutionException;
33 import java.util.concurrent.ExecutorService;
34 import java.util.concurrent.Future;
35 import java.util.concurrent.LinkedBlockingQueue;
36 import java.util.concurrent.ThreadPoolExecutor;
37 import java.util.concurrent.TimeUnit;
38 import java.util.concurrent.atomic.AtomicLong;
39
40 import org.apache.commons.logging.Log;
41 import org.apache.commons.logging.LogFactory;
42 import org.apache.hadoop.conf.Configuration;
43 import org.apache.hadoop.fs.Path;
44 import org.apache.hadoop.hbase.CellScanner;
45 import org.apache.hadoop.hbase.HBaseConfiguration;
46 import org.apache.hadoop.hbase.HBaseIOException;
47 import org.apache.hadoop.hbase.HConstants;
48 import org.apache.hadoop.hbase.HRegionInfo;
49 import org.apache.hadoop.hbase.HRegionLocation;
50 import org.apache.hadoop.hbase.HTableDescriptor;
51 import org.apache.hadoop.hbase.RegionLocations;
52 import org.apache.hadoop.hbase.TableDescriptors;
53 import org.apache.hadoop.hbase.TableName;
54 import org.apache.hadoop.hbase.TableNotFoundException;
55 import org.apache.hadoop.hbase.classification.InterfaceAudience;
56 import org.apache.hadoop.hbase.client.ClusterConnection;
57 import org.apache.hadoop.hbase.client.ConnectionFactory;
58 import org.apache.hadoop.hbase.client.RegionAdminServiceCallable;
59 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
60 import org.apache.hadoop.hbase.client.RetryingCallable;
61 import org.apache.hadoop.hbase.client.RpcRetryingCallerFactory;
62 import org.apache.hadoop.hbase.ipc.HBaseRpcController;
63 import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
64 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
65 import org.apache.hadoop.hbase.protobuf.ReplicationProtbufUtil;
66 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
67 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ReplicateWALEntryResponse;
68 import org.apache.hadoop.hbase.replication.BaseWALEntryFilter;
69 import org.apache.hadoop.hbase.replication.ChainWALEntryFilter;
70 import org.apache.hadoop.hbase.replication.HBaseReplicationEndpoint;
71 import org.apache.hadoop.hbase.replication.ReplicationEndpoint;
72 import org.apache.hadoop.hbase.replication.WALEntryFilter;
73 import org.apache.hadoop.hbase.util.Bytes;
74 import org.apache.hadoop.hbase.util.Pair;
75 import org.apache.hadoop.hbase.util.Threads;
76 import org.apache.hadoop.hbase.wal.WAL.Entry;
77 import org.apache.hadoop.hbase.wal.WALSplitter.EntryBuffers;
78 import org.apache.hadoop.hbase.wal.WALSplitter.OutputSink;
79 import org.apache.hadoop.hbase.wal.WALSplitter.PipelineController;
80 import org.apache.hadoop.hbase.wal.WALSplitter.RegionEntryBuffer;
81 import org.apache.hadoop.hbase.wal.WALSplitter.SinkWriter;
82 import org.apache.hadoop.util.StringUtils;
83
84
85
86
87
/**
 * A {@link ReplicationEndpoint} that replays WAL edits to the secondary region replicas
 * of the same cluster (region read replicas). Incoming entries are buffered through the
 * WALSplitter {@link EntryBuffers}/{@link OutputSink} pipeline and shipped to each
 * non-default replica using the AdminService "replay" RPC.
 */
@InterfaceAudience.Private
public class RegionReplicaReplicationEndpoint extends HBaseReplicationEndpoint {

  private static final Log LOG = LogFactory.getLog(RegionReplicaReplicationEndpoint.class);

  /** Config key overriding the number of client retries used for the replay RPCs. */
  // NOTE(review): this is effectively a constant and should ideally be declared 'final'.
  private static String CLIENT_RETRIES_NUMBER
      = "hbase.region.replica.replication.client.retries.number";

  // Private copy of the configuration (retry settings are tuned in init() and must not
  // leak back into the shared configuration).
  private Configuration conf;
  private ClusterConnection connection;
  private TableDescriptors tableDescriptors;

  // WALSplitter pipeline: replicate() appends entries into entryBuffers; the
  // outputSink's writer threads drain them, coordinated by the controller.
  private PipelineController controller;
  private RegionReplicaOutputSink outputSink;
  private EntryBuffers entryBuffers;

  /** Number of writer threads draining the entry buffers into the sink. */
  private int numWriterThreads;

  /** Client operation timeout (ms) applied to each replay RPC. */
  private int operationTimeout;

  /** Shared thread pool running the per-replica replay RPC tasks. */
  private ExecutorService pool;

  /**
   * Drops edits that are themselves the product of a replay: an entry whose key carries
   * an original log sequence number (origLogSeqNum &gt; 0) was already replicated once,
   * so replaying it again would be redundant.
   */
  private static class SkipReplayedEditsFilter extends BaseWALEntryFilter {
    @Override
    public Entry filter(Entry entry) {
      // An original sequence id marks an entry created by a previous replay; skip it.
      if (entry.getKey().getOrigLogSeqNum() > 0) {
        return null;
      }
      return entry;
    }
  }

  /**
   * Combines the superclass's entry filter (if any) with the skip-replayed-edits
   * filter (if any) into a single chain; returns whichever one exists when the
   * other is null.
   */
  @Override
  public WALEntryFilter getWALEntryfilter() {
    WALEntryFilter superFilter = super.getWALEntryfilter();
    WALEntryFilter skipReplayedEditsFilter = getSkipReplayedEditsFilter();

    if (superFilter == null) {
      return skipReplayedEditsFilter;
    }

    if (skipReplayedEditsFilter == null) {
      return superFilter;
    }

    ArrayList<WALEntryFilter> filters = Lists.newArrayList();
    filters.add(superFilter);
    filters.add(skipReplayedEditsFilter);
    return new ChainWALEntryFilter(filters);
  }

  /** Factory hook (overridable in tests/subclasses) for the replayed-edits filter. */
  protected WALEntryFilter getSkipReplayedEditsFilter() {
    return new SkipReplayedEditsFilter();
  }

  @Override
  public void init(Context context) throws IOException {
    super.init(context);

    // Copy the configuration so the retry tuning below stays local to this endpoint.
    this.conf = HBaseConfiguration.create(context.getConfiguration());
    this.tableDescriptors = context.getTableDescriptors();

    // This endpoint runs server side, so use fewer retries than a regular client:
    // when the configured client retry count is large (> 10), divide it by the
    // server-side retries multiplier.
    int defaultNumRetries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
      HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
    if (defaultNumRetries > 10) {
      int mult = conf.getInt("hbase.client.serverside.retries.multiplier", 10);
      defaultNumRetries = defaultNumRetries / mult;
    }

    // Force the multiplier to 1 so nested/server-side connections created from this
    // conf do not divide (or re-multiply) the retry count again.
    conf.setInt("hbase.client.serverside.retries.multiplier", 1);
    int numRetries = conf.getInt(CLIENT_RETRIES_NUMBER, defaultNumRetries);
    conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, numRetries);

    this.numWriterThreads = this.conf.getInt(
      "hbase.region.replica.replication.writer.threads", 3);
    controller = new PipelineController();
    // Buffer up to 128MB of edits by default before writers must drain them.
    entryBuffers = new EntryBuffers(controller,
      this.conf.getInt("hbase.region.replica.replication.buffersize",
          128*1024*1024));

    // Operation timeout for each replay RPC attempt.
    this.operationTimeout = conf.getInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT,
      HConstants.DEFAULT_HBASE_CLIENT_OPERATION_TIMEOUT);
  }

  @Override
  protected void doStart() {
    try {
      // Create the connection, the shared RPC pool and the output sink, then start the
      // sink's writer threads before signalling successful startup.
      connection = (ClusterConnection) ConnectionFactory.createConnection(this.conf);
      this.pool = getDefaultThreadPool(conf);
      outputSink = new RegionReplicaOutputSink(controller, tableDescriptors, entryBuffers,
        connection, pool, numWriterThreads, operationTimeout);
      outputSink.startWriterThreads();
      super.doStart();
    } catch (IOException ex) {
      LOG.warn("Received exception while creating connection :" + ex);
      notifyFailed(ex);
    }
  }

  @Override
  protected void doStop() {
    // Tear down in reverse order of doStart(): sink, pool, connection. Each step is
    // best-effort; failures are logged and shutdown continues.
    if (outputSink != null) {
      try {
        outputSink.finishWritingAndClose();
      } catch (IOException ex) {
        LOG.warn("Got exception while trying to close OutputSink");
        LOG.warn(ex);
      }
    }
    if (this.pool != null) {
      this.pool.shutdownNow();
      try {
        // Give in-flight replay tasks up to 10 seconds to wind down.
        boolean shutdown = this.pool.awaitTermination(10000, TimeUnit.MILLISECONDS);
        if (!shutdown) {
          LOG.warn("Failed to shutdown the thread pool after 10 seconds");
        }
      } catch (InterruptedException e) {
        LOG.warn("Got interrupted while waiting for the thread pool to shut down" + e);
      }
    }
    if (connection != null) {
      try {
        connection.close();
      } catch (IOException ex) {
        LOG.warn("Got exception closing connection :" + ex);
      }
    }
    super.doStop();
  }

  /**
   * Builds the shared daemon thread pool used for the replay RPCs. Pool sizes of 0 in
   * the configuration mean "8 x available processors". The work queue is bounded by
   * maxThreads x max-total-tasks, and core threads are allowed to time out so the pool
   * shrinks to zero when idle.
   */
  private ExecutorService getDefaultThreadPool(Configuration conf) {
    int maxThreads = conf.getInt("hbase.region.replica.replication.threads.max", 256);
    int coreThreads = conf.getInt("hbase.region.replica.replication.threads.core", 16);
    if (maxThreads == 0) {
      maxThreads = Runtime.getRuntime().availableProcessors() * 8;
    }
    if (coreThreads == 0) {
      coreThreads = Runtime.getRuntime().availableProcessors() * 8;
    }
    long keepAliveTime = conf.getLong("hbase.region.replica.replication.threads.keepalivetime", 60);
    LinkedBlockingQueue<Runnable> workQueue =
      new LinkedBlockingQueue<Runnable>(maxThreads *
          conf.getInt(HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS,
            HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS));
    ThreadPoolExecutor tpe = new ThreadPoolExecutor(
      coreThreads,
      maxThreads,
      keepAliveTime,
      TimeUnit.SECONDS,
      workQueue,
      Threads.newDaemonThreadFactory(this.getClass().getSimpleName() + "-rpc-shared-"));
    tpe.allowCoreThreadTimeOut(true);
    return tpe;
  }

  /**
   * Replicates one batch of WAL entries to the region replicas. While the endpoint is
   * running, the batch's entries are appended to the entry buffers and the sink is
   * flushed so every writer thread finishes the batch; edits the sink skipped are then
   * reported as "filtered" in the replication metrics. An IOException causes the whole
   * batch to be retried; interruption (or the endpoint stopping) aborts with false.
   */
  @Override
  public boolean replicate(ReplicateContext replicateContext) {
    while (this.isRunning()) {
      try {
        for (Entry entry: replicateContext.getEntries()) {
          entryBuffers.appendEntry(entry);
        }
        outputSink.flush(); // ensure all buffered edits have been written out
        // 'ctx' is presumably inherited from the replication endpoint base class
        // (not visible in this file) and exposes the source metrics.
        ctx.getMetrics().incrLogEditsFiltered(
          outputSink.getSkippedEditsCounter().getAndSet(0));
        return true;
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore interrupt status before returning
        return false;
      } catch (IOException e) {
        // Log and retry the batch from the top of the loop.
        LOG.warn("Received IOException while trying to replicate"
          + StringUtils.stringifyException(e));
      }
    }

    return false;
  }

  @Override
  public boolean canReplicateToSameCluster() {
    // Region replicas live in the same cluster as the primary, by definition.
    return true;
  }

  @Override
  protected WALEntryFilter getScopeWALEntryFilter() {
    // Returning null disables replication-scope filtering: every edit is shipped to
    // the replicas regardless of its scope.
    return null;
  }

  /**
   * OutputSink that forwards buffered region entries to a {@link RegionReplicaSinkWriter}.
   * Also caches, per table (5s expiry by default), whether memstore replication is
   * enabled, so that tables with replication disabled only have their meta edits shipped.
   */
  static class RegionReplicaOutputSink extends OutputSink {
    private final RegionReplicaSinkWriter sinkWriter;
    private final TableDescriptors tableDescriptors;
    private final Cache<TableName, Boolean> memstoreReplicationEnabled;

    public RegionReplicaOutputSink(PipelineController controller, TableDescriptors tableDescriptors,
        EntryBuffers entryBuffers, ClusterConnection connection, ExecutorService pool,
        int numWriters, int operationTimeout) {
      super(controller, entryBuffers, numWriters);
      this.sinkWriter =
          new RegionReplicaSinkWriter(this, connection, pool, operationTimeout, tableDescriptors);
      this.tableDescriptors = tableDescriptors;

      // Short-lived cache of the per-table "memstore replication enabled" flag, so the
      // table descriptor is not re-read for every buffer; entries expire after
      // 5 seconds by default so descriptor changes are picked up quickly.
      int memstoreReplicationEnabledCacheExpiryMs = connection.getConfiguration()
        .getInt("hbase.region.replica.replication.cache.memstoreReplicationEnabled.expiryMs", 5000);
      this.memstoreReplicationEnabled = CacheBuilder.newBuilder()
        .expireAfterWrite(memstoreReplicationEnabledCacheExpiryMs, TimeUnit.MILLISECONDS)
        .initialCapacity(10)
        .maximumSize(1000)
        .build();
    }

    @Override
    public void append(RegionEntryBuffer buffer) throws IOException {
      List<Entry> entries = buffer.getEntryBuffer();

      // Nothing to ship if the buffer is empty or its first edit carries no cells.
      if (entries.isEmpty() || entries.get(0).getEdit().getCells().isEmpty()) {
        return;
      }

      // Tables with memstore replication disabled only ship meta edits;
      // requiresReplication() filters the entry list accordingly (and may empty it).
      if (!requiresReplication(buffer.getTableName(), entries)) {
        return;
      }

      // Use the first cell's row to locate the region the entries belong to.
      sinkWriter.append(buffer.getTableName(), buffer.getEncodedRegionName(),
        entries.get(0).getEdit().getCells().get(0).getRow(), entries);
    }

    @Override
    public boolean flush() throws IOException {
      // Wait for all appended entries to be drained by the writer threads before
      // delegating to the superclass flush.
      entryBuffers.waitUntilDrained();
      return super.flush();
    }

    @Override
    public boolean keepRegionEvent(Entry entry) {
      // Region events (e.g. flush/compaction markers) must reach the replicas.
      return true;
    }

    @Override
    public List<Path> finishWritingAndClose() throws IOException {
      finishWriting(true);
      return null; // no split output files are produced by this sink
    }

    @Override
    public Map<byte[], Long> getOutputCounts() {
      return null; // not tracked for replica replication
    }

    @Override
    public int getNumberOfRecoveredRegions() {
      return 0; // not applicable to replica replication
    }

    AtomicLong getSkippedEditsCounter() {
      return skippedEdits;
    }

    /**
     * Decides whether the given entries need to be shipped to the replicas of
     * {@code tableName}. When the table's descriptor says memstore replication is
     * disabled, all non-meta edits are REMOVED from {@code entries} in place and
     * counted as skipped; the method then returns true only if at least one meta
     * edit remains. Returns true unconditionally when table descriptors are
     * unavailable or the table is unknown.
     */
    private boolean requiresReplication(final TableName tableName, final List<Entry> entries)
        throws IOException {
      // Without descriptors we cannot tell; err on the side of replicating.
      if (tableDescriptors == null) return true;

      Boolean requiresReplication = memstoreReplicationEnabled.getIfPresent(tableName);
      if (requiresReplication == null) {
        // Cache miss: consult the table descriptor. An unknown table (null descriptor)
        // is treated as requiring replication.
        HTableDescriptor htd = tableDescriptors.get(tableName);
        requiresReplication = htd == null || htd.hasRegionMemstoreReplication();
        memstoreReplicationEnabled.put(tableName, requiresReplication);
      }

      // Memstore replication disabled: strip regular edits but keep meta edits
      // (flush/compaction/region events), which must still reach the replicas.
      if (!requiresReplication) {
        int skipEdits = 0;
        java.util.Iterator<Entry> it = entries.iterator();
        while (it.hasNext()) {
          Entry entry = it.next();
          if (entry.getEdit().isMetaEdit()) {
            requiresReplication = true;
          } else {
            it.remove();
            skipEdits++;
          }
        }
        skippedEdits.addAndGet(skipEdits);
      }
      return requiresReplication;
    }
  }

  /**
   * Writes a batch of entries for one region to all of its secondary replicas by
   * submitting one {@link RegionReplicaReplayCallable} per replica to the shared pool.
   * Caches tables known to be disabled or dropped (5s expiry by default) so their
   * edits are skipped cheaply instead of failing repeatedly.
   */
  static class RegionReplicaSinkWriter extends SinkWriter {
    RegionReplicaOutputSink sink;
    ClusterConnection connection;
    RpcControllerFactory rpcControllerFactory;
    RpcRetryingCallerFactory rpcRetryingCallerFactory;
    int operationTimeout;
    ExecutorService pool;
    Cache<TableName, Boolean> disabledAndDroppedTables;
    TableDescriptors tableDescriptors;

    public RegionReplicaSinkWriter(RegionReplicaOutputSink sink, ClusterConnection connection,
        ExecutorService pool, int operationTimeout, TableDescriptors tableDescriptors) {
      this.sink = sink;
      this.connection = connection;
      this.operationTimeout = operationTimeout;
      this.rpcRetryingCallerFactory
        = RpcRetryingCallerFactory.instantiate(connection.getConfiguration());
      this.rpcControllerFactory = RpcControllerFactory.instantiate(connection.getConfiguration());
      this.pool = pool;
      this.tableDescriptors = tableDescriptors;

      int nonExistentTableCacheExpiryMs = connection.getConfiguration()
        .getInt("hbase.region.replica.replication.cache.disabledAndDroppedTables.expiryMs", 5000);
      // Short-lived cache of tables found to be disabled or dropped: their edits are
      // counted as skipped without attempting the RPCs. The short expiry bounds how
      // long edits keep being dropped after a table is re-enabled/re-created.
      disabledAndDroppedTables = CacheBuilder.newBuilder()
        .expireAfterWrite(nonExistentTableCacheExpiryMs, TimeUnit.MILLISECONDS)
        .initialCapacity(10)
        .maximumSize(1000)
        .build();
    }

    /**
     * Ships {@code entries} (all from the region originally named
     * {@code encodedRegionName}, located via {@code row}) to every secondary replica
     * of that region. Edits are skipped — and counted in the sink's skipped-edits
     * counter — when the table is cached/detected as disabled or dropped, or when the
     * located primary region no longer matches the WAL's original region (e.g. after
     * a split or merge).
     */
    public void append(TableName tableName, byte[] encodedRegionName, byte[] row,
        List<Entry> entries) throws IOException {

      // Fast path: table already known to be disabled or dropped.
      if (disabledAndDroppedTables.getIfPresent(tableName) != null) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("Skipping " + entries.size() + " entries because table " + tableName
            + " is cached as a disabled or dropped table");
          for (Entry entry : entries) {
            LOG.trace("Skipping : " + entry);
          }
        }
        sink.getSkippedEditsCounter().addAndGet(entries.size());
        return;
      }

      // Locate the region (and its replicas) for the row. First try the meta cache;
      // if the located primary does not match the WAL's region, retry once bypassing
      // the cache before concluding the region has changed (split/merge) and skipping.
      RegionLocations locations = null;
      boolean useCache = true;
      while (true) {
        // get the replicas of the primary region
        try {
          locations = RegionReplicaReplayCallable
              .getRegionLocations(connection, tableName, row, useCache, 0);

          if (locations == null) {
            throw new HBaseIOException("Cannot locate locations for "
                + tableName + ", row:" + Bytes.toStringBinary(row));
          }
        } catch (TableNotFoundException e) {
          // Table was dropped: remember that and skip this batch.
          if (LOG.isTraceEnabled()) {
            LOG.trace("Skipping " + entries.size() + " entries because table " + tableName
              + " is dropped. Adding table to cache.");
            for (Entry entry : entries) {
              LOG.trace("Skipping : " + entry);
            }
          }
          disabledAndDroppedTables.put(tableName, Boolean.TRUE);

          sink.getSkippedEditsCounter().addAndGet(entries.size());
          return;
        }

        // Check that the located primary is still the region the WAL entries came
        // from; if not (stale cache, or the region was split/merged), either retry
        // without the cache or give up and skip the batch.
        HRegionLocation primaryLocation = locations.getDefaultRegionLocation();
        if (!Bytes.equals(primaryLocation.getRegionInfo().getEncodedNameAsBytes(),
          encodedRegionName)) {
          if (useCache) {
            useCache = false;
            continue; // retry the lookup once with a fresh meta scan
          }
          if (LOG.isTraceEnabled()) {
            LOG.trace("Skipping " + entries.size() + " entries in table " + tableName
              + " because located region " + primaryLocation.getRegionInfo().getEncodedName()
              + " is different than the original region " + Bytes.toStringBinary(encodedRegionName)
              + " from WALEdit");
            for (Entry entry : entries) {
              LOG.trace("Skipping : " + entry);
            }
          }
          sink.getSkippedEditsCounter().addAndGet(entries.size());
          return;
        }
        break;
      }

      // Only the primary exists: there are no replicas to ship to.
      if (locations.size() == 1) {
        return;
      }

      ArrayList<Future<ReplicateWALEntryResponse>> tasks
        = new ArrayList<Future<ReplicateWALEntryResponse>>(locations.size() - 1);

      // Submit one replay task per secondary replica. When a replica has no cached
      // location, synthesize its HRegionInfo from the primary's.
      for (int replicaId = 0; replicaId < locations.size(); replicaId++) {
        HRegionLocation location = locations.getRegionLocation(replicaId);
        if (!RegionReplicaUtil.isDefaultReplica(replicaId)) {
          HRegionInfo regionInfo = location == null
              ? RegionReplicaUtil.getRegionInfoForReplica(
                locations.getDefaultRegionLocation().getRegionInfo(), replicaId)
              : location.getRegionInfo();
          RegionReplicaReplayCallable callable = new RegionReplicaReplayCallable(connection,
            rpcControllerFactory, tableName, location, regionInfo, row, entries,
            sink.getSkippedEditsCounter());
          Future<ReplicateWALEntryResponse> task = pool.submit(
            new RetryingRpcCallable<ReplicateWALEntryResponse>(rpcRetryingCallerFactory,
                callable, operationTimeout));
          tasks.add(task);
        }
      }

      // Wait for all replay tasks. NOTE: here 'replicaId' is the index into 'tasks';
      // since the default replica (id 0) submits no task, tasks.get(i) corresponds to
      // actual replica id (i + 1).
      boolean tasksCancelled = false;
      for (int replicaId = 0; replicaId < tasks.size(); replicaId++) {
        try {
          tasks.get(replicaId).get();
        } catch (InterruptedException e) {
          throw new InterruptedIOException(e.getMessage());
        } catch (ExecutionException e) {
          Throwable cause = e.getCause();
          boolean canBeSkipped = false;
          if (cause instanceof IOException) {
            // The failure can be ignored when the table was dropped or disabled, or
            // when the table's configured region replication has shrunk so that this
            // replica (id = replicaId + 1) no longer exists.
            if (cause instanceof TableNotFoundException || connection.isTableDisabled(tableName)) {
              disabledAndDroppedTables.put(tableName, Boolean.TRUE);
              canBeSkipped = true;
            } else if (tableDescriptors != null) {
              HTableDescriptor tableDescriptor = tableDescriptors.get(tableName);
              if (tableDescriptor != null
                  // (replicaId + 1) converts the task index back to the replica id.
                  && tableDescriptor.getRegionReplication() <= (replicaId + 1)) {
                canBeSkipped = true;
              }
            }
            if (canBeSkipped) {
              if (LOG.isTraceEnabled()) {
                LOG.trace("Skipping " + entries.size() + " entries in table " + tableName
                  + " because received exception for dropped or disabled table",
                  cause);
                for (Entry entry : entries) {
                  LOG.trace("Skipping : " + entry);
                }
              }
              // Count this batch as skipped only once, no matter how many replica
              // tasks fail with a skippable cause.
              if (!tasksCancelled) {
                sink.getSkippedEditsCounter().addAndGet(entries.size());
                tasksCancelled = true;
              }
              continue;
            }
            // Non-skippable IOException: propagate as-is.
            throw (IOException)cause;
          }
          // Non-IOException failure: wrap and propagate.
          throw new IOException(cause);
        }
      }
    }
  }

  /**
   * Adapts a {@link RetryingCallable} to {@link Callable} by executing it through an
   * {@link RpcRetryingCallerFactory}-produced caller with the given timeout.
   */
  static class RetryingRpcCallable<V> implements Callable<V> {
    RpcRetryingCallerFactory factory;
    RetryingCallable<V> callable;
    int timeout;
    public RetryingRpcCallable(RpcRetryingCallerFactory factory, RetryingCallable<V> callable,
        int timeout) {
      this.factory = factory;
      this.callable = callable;
      this.timeout = timeout;
    }
    @Override
    public V call() throws Exception {
      return factory.<V>newCaller().callWithRetries(callable, timeout);
    }
  }

  /**
   * Calls the AdminService "replay" RPC against a specific region replica to ship a
   * list of WAL entries. If, by the time the call runs (possibly after relocation on
   * retry), the target region differs from the one the entries originated in, the
   * entries are skipped and counted instead of being replayed.
   */
  static class RegionReplicaReplayCallable
    extends RegionAdminServiceCallable<ReplicateWALEntryResponse> {

    private final List<Entry> entries;
    // Encoded name of the region the entries were originally written to; used to
    // detect that the target region changed (e.g. split/merge) between retries.
    private final byte[] initialEncodedRegionName;
    private final AtomicLong skippedEntries;

    public RegionReplicaReplayCallable(ClusterConnection connection,
        RpcControllerFactory rpcControllerFactory, TableName tableName,
        HRegionLocation location, HRegionInfo regionInfo, byte[] row,List<Entry> entries,
        AtomicLong skippedEntries) {
      super(connection, rpcControllerFactory, location, tableName, row, regionInfo.getReplicaId());
      this.entries = entries;
      this.skippedEntries = skippedEntries;
      this.initialEncodedRegionName = regionInfo.getEncodedNameAsBytes();
    }

    @Override
    public ReplicateWALEntryResponse call(int timeout) throws IOException {
      return replayToServer(this.entries, timeout);
    }

    private ReplicateWALEntryResponse replayToServer(List<Entry> entries, int timeout)
        throws IOException {
      // If the located region no longer matches the region the entries came from,
      // do not replay them — the region boundary has changed underneath us.
      boolean skip = false;

      if (!Bytes.equals(location.getRegionInfo().getEncodedNameAsBytes(),
        initialEncodedRegionName)) {
        skip = true;
      }
      if (!entries.isEmpty() && !skip) {
        Entry[] entriesArray = new Entry[entries.size()];
        entriesArray = entries.toArray(entriesArray);

        // Build the replay request (cells travel out-of-band via the CellScanner)
        // and issue it with the table's priority and the caller-supplied timeout.
        Pair<AdminProtos.ReplicateWALEntryRequest, CellScanner> p =
            ReplicationProtbufUtil.buildReplicateWALEntryRequest(entriesArray,
              location.getRegionInfo().getEncodedNameAsBytes(), null, null, null);
        try {
          HBaseRpcController controller = rpcControllerFactory.newController(p.getSecond());
          controller.setCallTimeout(timeout);
          controller.setPriority(tableName);
          return stub.replay(controller, p.getFirst());
        } catch (ServiceException se) {
          throw ProtobufUtil.getRemoteException(se);
        }
      }

      if (skip) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("Skipping " + entries.size() + " entries in table " + tableName
            + " because located region " + location.getRegionInfo().getEncodedName()
            + " is different than the original region "
            + Bytes.toStringBinary(initialEncodedRegionName) + " from WALEdit");
          for (Entry entry : entries) {
            LOG.trace("Skipping : " + entry);
          }
        }
        skippedEntries.addAndGet(entries.size());
      }
      // Skipped or empty batches return an empty (successful) response.
      return ReplicateWALEntryResponse.newBuilder().build();
    }
  }
}