View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.replication.regionserver;
20  
21  import com.google.common.cache.Cache;
22  import com.google.common.cache.CacheBuilder;
23  import com.google.common.collect.Lists;
24  import com.google.protobuf.ServiceException;
25  
26  import java.io.IOException;
27  import java.io.InterruptedIOException;
28  import java.util.ArrayList;
29  import java.util.List;
30  import java.util.Map;
31  import java.util.concurrent.Callable;
32  import java.util.concurrent.ExecutionException;
33  import java.util.concurrent.ExecutorService;
34  import java.util.concurrent.Future;
35  import java.util.concurrent.LinkedBlockingQueue;
36  import java.util.concurrent.ThreadPoolExecutor;
37  import java.util.concurrent.TimeUnit;
38  import java.util.concurrent.atomic.AtomicLong;
39  
40  import org.apache.commons.logging.Log;
41  import org.apache.commons.logging.LogFactory;
42  import org.apache.hadoop.conf.Configuration;
43  import org.apache.hadoop.fs.Path;
44  import org.apache.hadoop.hbase.CellScanner;
45  import org.apache.hadoop.hbase.HBaseConfiguration;
46  import org.apache.hadoop.hbase.HBaseIOException;
47  import org.apache.hadoop.hbase.HConstants;
48  import org.apache.hadoop.hbase.HRegionInfo;
49  import org.apache.hadoop.hbase.HRegionLocation;
50  import org.apache.hadoop.hbase.HTableDescriptor;
51  import org.apache.hadoop.hbase.RegionLocations;
52  import org.apache.hadoop.hbase.TableDescriptors;
53  import org.apache.hadoop.hbase.TableName;
54  import org.apache.hadoop.hbase.TableNotFoundException;
55  import org.apache.hadoop.hbase.classification.InterfaceAudience;
56  import org.apache.hadoop.hbase.client.ClusterConnection;
57  import org.apache.hadoop.hbase.client.ConnectionFactory;
58  import org.apache.hadoop.hbase.client.RegionAdminServiceCallable;
59  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
60  import org.apache.hadoop.hbase.client.RetryingCallable;
61  import org.apache.hadoop.hbase.client.RpcRetryingCallerFactory;
62  import org.apache.hadoop.hbase.ipc.HBaseRpcController;
63  import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
64  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
65  import org.apache.hadoop.hbase.protobuf.ReplicationProtbufUtil;
66  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
67  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ReplicateWALEntryResponse;
68  import org.apache.hadoop.hbase.replication.BaseWALEntryFilter;
69  import org.apache.hadoop.hbase.replication.ChainWALEntryFilter;
70  import org.apache.hadoop.hbase.replication.HBaseReplicationEndpoint;
71  import org.apache.hadoop.hbase.replication.ReplicationEndpoint;
72  import org.apache.hadoop.hbase.replication.WALEntryFilter;
73  import org.apache.hadoop.hbase.util.Bytes;
74  import org.apache.hadoop.hbase.util.Pair;
75  import org.apache.hadoop.hbase.util.Threads;
76  import org.apache.hadoop.hbase.wal.WAL.Entry;
77  import org.apache.hadoop.hbase.wal.WALSplitter.EntryBuffers;
78  import org.apache.hadoop.hbase.wal.WALSplitter.OutputSink;
79  import org.apache.hadoop.hbase.wal.WALSplitter.PipelineController;
80  import org.apache.hadoop.hbase.wal.WALSplitter.RegionEntryBuffer;
81  import org.apache.hadoop.hbase.wal.WALSplitter.SinkWriter;
82  import org.apache.hadoop.util.StringUtils;
83  
84  /**
85   * A {@link ReplicationEndpoint} endpoint which receives the WAL edits from the
86   * WAL, and sends the edits to replicas of regions.
87   */
88  @InterfaceAudience.Private
89  public class RegionReplicaReplicationEndpoint extends HBaseReplicationEndpoint {
90  
91    private static final Log LOG = LogFactory.getLog(RegionReplicaReplicationEndpoint.class);
92  
93    // Can be configured differently than hbase.client.retries.number
94    private static String CLIENT_RETRIES_NUMBER
95      = "hbase.region.replica.replication.client.retries.number";
96  
97    private Configuration conf;
98    private ClusterConnection connection;
99    private TableDescriptors tableDescriptors;
100 
101   // Reuse WALSplitter constructs as a WAL pipe
102   private PipelineController controller;
103   private RegionReplicaOutputSink outputSink;
104   private EntryBuffers entryBuffers;
105 
106   // Number of writer threads
107   private int numWriterThreads;
108 
109   private int operationTimeout;
110 
111   private ExecutorService pool;
112 
113   /**
114    * Skips the entries which has original seqId. Only entries persisted via distributed log replay
115    * have their original seq Id fields set.
116    */
117   private static class SkipReplayedEditsFilter extends BaseWALEntryFilter {
118     @Override
119     public Entry filter(Entry entry) {
120       // if orig seq id is set, skip replaying the entry
121       if (entry.getKey().getOrigLogSeqNum() > 0) {
122         return null;
123       }
124       return entry;
125     }
126   }
127 
128   @Override
129   public WALEntryFilter getWALEntryfilter() {
130     WALEntryFilter superFilter = super.getWALEntryfilter();
131     WALEntryFilter skipReplayedEditsFilter = getSkipReplayedEditsFilter();
132 
133     if (superFilter == null) {
134       return skipReplayedEditsFilter;
135     }
136 
137     if (skipReplayedEditsFilter == null) {
138       return superFilter;
139     }
140 
141     ArrayList<WALEntryFilter> filters = Lists.newArrayList();
142     filters.add(superFilter);
143     filters.add(skipReplayedEditsFilter);
144     return new ChainWALEntryFilter(filters);
145   }
146 
  /**
   * Returns the filter that drops edits already persisted via distributed log replay.
   * Protected so subclasses (and tests) can override or disable this filtering.
   */
  protected WALEntryFilter getSkipReplayedEditsFilter() {
    return new SkipReplayedEditsFilter();
  }
150 
  /**
   * Initializes the endpoint: copies the context configuration, normalizes the client retry
   * count (undoing the regionserver-side multiplier), and sets up the WALSplitter pipeline
   * (controller + entry buffers) used to fan edits out per region.
   */
  @Override
  public void init(Context context) throws IOException {
    super.init(context);

    // Work on a private copy so the retry settings below do not leak to other users of the conf.
    this.conf = HBaseConfiguration.create(context.getConfiguration());
    this.tableDescriptors = context.getTableDescriptors();

    // HRS multiplies client retries by 10 globally for meta operations, but we do not want this.
    // We are resetting it here because we want default number of retries (35) rather than 10 times
    // that which makes very long retries for disabled tables etc.
    int defaultNumRetries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
      HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
    if (defaultNumRetries > 10) {
      int mult = conf.getInt("hbase.client.serverside.retries.multiplier", 10);
      defaultNumRetries = defaultNumRetries / mult; // reset if HRS has multiplied this already
    }

    conf.setInt("hbase.client.serverside.retries.multiplier", 1);
    // CLIENT_RETRIES_NUMBER lets operators tune replica replication retries independently.
    int numRetries = conf.getInt(CLIENT_RETRIES_NUMBER, defaultNumRetries);
    conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, numRetries);

    this.numWriterThreads = this.conf.getInt(
      "hbase.region.replica.replication.writer.threads", 3);
    controller = new PipelineController();
    // Blocking buffer (default 128MB) that groups incoming edits per region.
    entryBuffers = new EntryBuffers(controller,
      this.conf.getInt("hbase.region.replica.replication.buffersize",
          128*1024*1024));

    // use the regular RPC timeout for replica replication RPC's
    this.operationTimeout = conf.getInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT,
      HConstants.DEFAULT_HBASE_CLIENT_OPERATION_TIMEOUT);
  }
183 
184   @Override
185   protected void doStart() {
186     try {
187       connection = (ClusterConnection) ConnectionFactory.createConnection(this.conf);
188       this.pool = getDefaultThreadPool(conf);
189       outputSink = new RegionReplicaOutputSink(controller, tableDescriptors, entryBuffers,
190         connection, pool, numWriterThreads, operationTimeout);
191       outputSink.startWriterThreads();
192       super.doStart();
193     } catch (IOException ex) {
194       LOG.warn("Received exception while creating connection :" + ex);
195       notifyFailed(ex);
196     }
197   }
198 
199   @Override
200   protected void doStop() {
201     if (outputSink != null) {
202       try {
203         outputSink.finishWritingAndClose();
204       } catch (IOException ex) {
205         LOG.warn("Got exception while trying to close OutputSink");
206         LOG.warn(ex);
207       }
208     }
209     if (this.pool != null) {
210       this.pool.shutdownNow();
211       try {
212         // wait for 10 sec
213         boolean shutdown = this.pool.awaitTermination(10000, TimeUnit.MILLISECONDS);
214         if (!shutdown) {
215           LOG.warn("Failed to shutdown the thread pool after 10 seconds");
216         }
217       } catch (InterruptedException e) {
218         LOG.warn("Got interrupted while waiting for the thread pool to shut down" + e);
219       }
220     }
221     if (connection != null) {
222       try {
223         connection.close();
224       } catch (IOException ex) {
225         LOG.warn("Got exception closing connection :" + ex);
226       }
227     }
228     super.doStop();
229   }
230 
231   /**
232    * Returns a Thread pool for the RPC's to region replicas. Similar to
233    * Connection's thread pool.
234    */
235   private ExecutorService getDefaultThreadPool(Configuration conf) {
236     int maxThreads = conf.getInt("hbase.region.replica.replication.threads.max", 256);
237     int coreThreads = conf.getInt("hbase.region.replica.replication.threads.core", 16);
238     if (maxThreads == 0) {
239       maxThreads = Runtime.getRuntime().availableProcessors() * 8;
240     }
241     if (coreThreads == 0) {
242       coreThreads = Runtime.getRuntime().availableProcessors() * 8;
243     }
244     long keepAliveTime = conf.getLong("hbase.region.replica.replication.threads.keepalivetime", 60);
245     LinkedBlockingQueue<Runnable> workQueue =
246         new LinkedBlockingQueue<Runnable>(maxThreads *
247             conf.getInt(HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS,
248               HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS));
249     ThreadPoolExecutor tpe = new ThreadPoolExecutor(
250       coreThreads,
251       maxThreads,
252       keepAliveTime,
253       TimeUnit.SECONDS,
254       workQueue,
255       Threads.newDaemonThreadFactory(this.getClass().getSimpleName() + "-rpc-shared-"));
256     tpe.allowCoreThreadTimeOut(true);
257     return tpe;
258   }
259 
  /**
   * Replicates the batch of WAL entries in {@code replicateContext} to secondary region
   * replicas. Blocks until all buffered edits are flushed and acknowledged by the sinks.
   * Returns true on success; false if interrupted or if the endpoint stopped running.
   * On IOException the whole batch is retried while the endpoint is still running.
   */
  @Override
  public boolean replicate(ReplicateContext replicateContext) {
    /* A note on batching in RegionReplicaReplicationEndpoint (RRRE):
     *
     * RRRE relies on batching from two different mechanisms. The first is the batching from
     * ReplicationSource since RRRE is a ReplicationEndpoint driven by RS. RS reads from a single
     * WAL file filling up a buffer of heap size "replication.source.size.capacity"(64MB) or at most
     * "replication.source.nb.capacity" entries or until it sees the end of file (in live tailing).
     * Then RS passes all the buffered edits in this replicate() call context. RRRE puts the edits
     * to the WALSplitter.EntryBuffers which is a blocking buffer space of up to
     * "hbase.region.replica.replication.buffersize" (128MB) in size. This buffer splits the edits
     * based on regions.
     *
     * There are "hbase.region.replica.replication.writer.threads"(default 3) writer threads which
     * pick largest per-region buffer and send it to the SinkWriter (see RegionReplicaOutputSink).
     * The SinkWriter in this case will send the wal edits to all secondary region replicas in
     * parallel via a retrying rpc call. EntryBuffers guarantees that while a buffer is
     * being written to the sink, another buffer for the same region will not be made available to
     * writers ensuring regions edits are not replayed out of order.
     *
     * The replicate() call won't return until all the buffers are sent and ack'd by the sinks so
     * that the replication can assume all edits are persisted. We may be able to do a better
     * pipelining between the replication thread and output sinks later if it becomes a bottleneck.
     */

    while (this.isRunning()) {
      try {
        for (Entry entry: replicateContext.getEntries()) {
          entryBuffers.appendEntry(entry);
        }
        outputSink.flush(); // make sure everything is flushed
        // Credit edits the sink skipped (dropped/disabled tables etc.) as filtered,
        // resetting the counter so they are not double-counted on the next batch.
        ctx.getMetrics().incrLogEditsFiltered(
          outputSink.getSkippedEditsCounter().getAndSet(0));
        return true;
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        return false;
      } catch (IOException e) {
        // Stay in the loop and retry the batch while the endpoint is running.
        LOG.warn("Received IOException while trying to replicate"
            + StringUtils.stringifyException(e));
      }
    }

    return false;
  }
305 
  /**
   * Region replicas live in the same cluster as the primary, so same-cluster
   * replication must be allowed for this endpoint.
   */
  @Override
  public boolean canReplicateToSameCluster() {
    return true;
  }
310 
  /**
   * Returns null to disable replication-scope filtering: every edit is shipped to
   * region replicas regardless of its scope.
   */
  @Override
  protected WALEntryFilter getScopeWALEntryFilter() {
    // we do not care about scope. We replicate everything.
    return null;
  }
316 
  /**
   * {@link OutputSink} that hands each per-region buffer of WAL edits to a
   * {@link RegionReplicaSinkWriter}, skipping data edits for tables that have
   * region memstore replication disabled (meta edits are always shipped).
   */
  static class RegionReplicaOutputSink extends OutputSink {
    private final RegionReplicaSinkWriter sinkWriter;
    private final TableDescriptors tableDescriptors;
    // Short-lived per-table cache of the "memstore replication enabled" flag; see constructor.
    private final Cache<TableName, Boolean> memstoreReplicationEnabled;

    public RegionReplicaOutputSink(PipelineController controller, TableDescriptors tableDescriptors,
        EntryBuffers entryBuffers, ClusterConnection connection, ExecutorService pool,
        int numWriters, int operationTimeout) {
      super(controller, entryBuffers, numWriters);
      this.sinkWriter =
          new RegionReplicaSinkWriter(this, connection, pool, operationTimeout, tableDescriptors);
      this.tableDescriptors = tableDescriptors;

      // A cache for the table "memstore replication enabled" flag.
      // It has a default expiry of 5 sec. This means that if the table is altered
      // with a different flag value, we might miss to replicate for that amount of
      // time. But this cache avoids the slow lookup and parsing of the TableDescriptor.
      int memstoreReplicationEnabledCacheExpiryMs = connection.getConfiguration()
        .getInt("hbase.region.replica.replication.cache.memstoreReplicationEnabled.expiryMs", 5000);
      this.memstoreReplicationEnabled = CacheBuilder.newBuilder()
        .expireAfterWrite(memstoreReplicationEnabledCacheExpiryMs, TimeUnit.MILLISECONDS)
        .initialCapacity(10)
        .maximumSize(1000)
        .build();
    }

    /**
     * Ships one per-region buffer to the sink writer, unless it is empty or the
     * table does not require replication of these entries.
     */
    @Override
    public void append(RegionEntryBuffer buffer) throws IOException {
      List<Entry> entries = buffer.getEntryBuffer();

      // Nothing to do if the buffer is empty or its first entry carries no cells.
      if (entries.isEmpty() || entries.get(0).getEdit().getCells().isEmpty()) {
        return;
      }

      // meta edits (e.g. flush) are always replicated.
      // data edits (e.g. put) are replicated if the table requires them.
      if (!requiresReplication(buffer.getTableName(), entries)) {
        return;
      }

      // The first cell's row is used by the writer to locate the region's replicas.
      sinkWriter.append(buffer.getTableName(), buffer.getEncodedRegionName(),
        entries.get(0).getEdit().getCells().get(0).getRow(), entries);
    }

    @Override
    public boolean flush() throws IOException {
      // nothing much to do for now. Wait for the Writer threads to finish up
      // append()'ing the data.
      entryBuffers.waitUntilDrained();
      return super.flush();
    }

    @Override
    public boolean keepRegionEvent(Entry entry) {
      // Region event markers are always kept and replayed on the replicas.
      return true;
    }

    @Override
    public List<Path> finishWritingAndClose() throws IOException {
      finishWriting(true);
      // No recovered-edits files are produced by this sink; nothing to return.
      return null;
    }

    @Override
    public Map<byte[], Long> getOutputCounts() {
      return null; // only used in tests
    }

    @Override
    public int getNumberOfRecoveredRegions() {
      // This sink replicates live edits; it never recovers regions.
      return 0;
    }

    AtomicLong getSkippedEditsCounter() {
      return skippedEdits;
    }

    /**
     * Returns true if the specified entries must be replicated.
     * We should always replicate meta operations (e.g. flush)
     * and use the user HTD flag to decide whether or not replicate the memstore.
     * NOTE: when memstore replication is disabled for the table, non-meta edits are
     * removed from {@code entries} in place and counted as skipped.
     */
    private boolean requiresReplication(final TableName tableName, final List<Entry> entries)
        throws IOException {
      // unit-tests may not have the TableDescriptors, bypass the check and always replicate
      if (tableDescriptors == null) return true;

      Boolean requiresReplication = memstoreReplicationEnabled.getIfPresent(tableName);
      if (requiresReplication == null) {
        // check if the table requires memstore replication
        // some unit-test drop the table, so we should do a bypass check and always replicate.
        HTableDescriptor htd = tableDescriptors.get(tableName);
        requiresReplication = htd == null || htd.hasRegionMemstoreReplication();
        memstoreReplicationEnabled.put(tableName, requiresReplication);
      }

      // if memstore replication is not required, check the entries.
      // meta edits (e.g. flush) must be always replicated.
      if (!requiresReplication) {
        int skipEdits = 0;
        java.util.Iterator<Entry> it = entries.iterator();
        while (it.hasNext()) {
          Entry entry = it.next();
          if (entry.getEdit().isMetaEdit()) {
            requiresReplication = true;
          } else {
            // Drop the data edit via the iterator (safe during iteration) and count it.
            it.remove();
            skipEdits++;
          }
        }
        skippedEdits.addAndGet(skipEdits);
      }
      return requiresReplication;
    }
  }
432 
  /**
   * Ships a batch of WAL entries belonging to a single region to every secondary replica
   * of that region in parallel, with retries. Tables recently seen as disabled or dropped
   * are cached briefly so further edits for them are skipped cheaply.
   */
  static class RegionReplicaSinkWriter extends SinkWriter {
    RegionReplicaOutputSink sink;
    ClusterConnection connection;
    RpcControllerFactory rpcControllerFactory;
    RpcRetryingCallerFactory rpcRetryingCallerFactory;
    int operationTimeout;
    ExecutorService pool;
    // Tables recently observed as disabled or dropped; entries expire after ~5 sec.
    Cache<TableName, Boolean> disabledAndDroppedTables;
    TableDescriptors tableDescriptors;

    public RegionReplicaSinkWriter(RegionReplicaOutputSink sink, ClusterConnection connection,
        ExecutorService pool, int operationTimeout, TableDescriptors tableDescriptors) {
      this.sink = sink;
      this.connection = connection;
      this.operationTimeout = operationTimeout;
      this.rpcRetryingCallerFactory
        = RpcRetryingCallerFactory.instantiate(connection.getConfiguration());
      this.rpcControllerFactory = RpcControllerFactory.instantiate(connection.getConfiguration());
      this.pool = pool;
      this.tableDescriptors = tableDescriptors;

      int nonExistentTableCacheExpiryMs = connection.getConfiguration()
        .getInt("hbase.region.replica.replication.cache.disabledAndDroppedTables.expiryMs", 5000);
      // A cache for non existing tables that have a default expiry of 5 sec. This means that if the
      // table is created again with the same name, we might miss to replicate for that amount of
      // time. But this cache prevents overloading meta requests for every edit from a deleted file.
      disabledAndDroppedTables = CacheBuilder.newBuilder()
        .expireAfterWrite(nonExistentTableCacheExpiryMs, TimeUnit.MILLISECONDS)
        .initialCapacity(10)
        .maximumSize(1000)
        .build();
    }

    /**
     * Replays {@code entries} (all belonging to the region named by
     * {@code encodedRegionName}) on every secondary replica of the region containing
     * {@code row}. Entries for dropped/disabled tables, or for regions that have since
     * split/merged, are skipped and accounted in the sink's skipped-edits counter.
     */
    public void append(TableName tableName, byte[] encodedRegionName, byte[] row,
        List<Entry> entries) throws IOException {

      // Fast path: table was recently seen as disabled/dropped; skip without a meta lookup.
      if (disabledAndDroppedTables.getIfPresent(tableName) != null) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("Skipping " + entries.size() + " entries because table " + tableName
            + " is cached as a disabled or dropped table");
          for (Entry entry : entries) {
            LOG.trace("Skipping : " + entry);
          }
        }
        sink.getSkippedEditsCounter().addAndGet(entries.size());
        return;
      }

      // If the table is disabled or dropped, we should not replay the entries, and we can skip
      // replaying them. However, we might not know whether the table is disabled until we
      // invalidate the cache and check from meta
      RegionLocations locations = null;
      boolean useCache = true;
      while (true) {
        // get the replicas of the primary region
        try {
          locations = RegionReplicaReplayCallable
              .getRegionLocations(connection, tableName, row, useCache, 0);

          if (locations == null) {
            throw new HBaseIOException("Cannot locate locations for "
                + tableName + ", row:" + Bytes.toStringBinary(row));
          }
        } catch (TableNotFoundException e) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Skipping " + entries.size() + " entries because table " + tableName
              + " is dropped. Adding table to cache.");
            for (Entry entry : entries) {
              LOG.trace("Skipping : " + entry);
            }
          }
          disabledAndDroppedTables.put(tableName, Boolean.TRUE); // put to cache. Value ignored
          // skip this entry
          sink.getSkippedEditsCounter().addAndGet(entries.size());
          return;
        }

        // check whether we should still replay this entry. If the regions are changed, or the
        // entry is not coming from the primary region, filter it out.
        HRegionLocation primaryLocation = locations.getDefaultRegionLocation();
        if (!Bytes.equals(primaryLocation.getRegionInfo().getEncodedNameAsBytes(),
          encodedRegionName)) {
          if (useCache) {
            // The cached location may be stale; retry once with a fresh meta lookup.
            useCache = false;
            continue; // this will retry location lookup
          }
          if (LOG.isTraceEnabled()) {
            LOG.trace("Skipping " + entries.size() + " entries in table " + tableName
              + " because located region " + primaryLocation.getRegionInfo().getEncodedName()
              + " is different than the original region " + Bytes.toStringBinary(encodedRegionName)
              + " from WALEdit");
            for (Entry entry : entries) {
              LOG.trace("Skipping : " + entry);
            }
          }
          sink.getSkippedEditsCounter().addAndGet(entries.size());
          return;
        }
        break;
      }

      // Only the primary replica exists; there is nothing to replicate to.
      if (locations.size() == 1) {
        return;
      }

      ArrayList<Future<ReplicateWALEntryResponse>> tasks
        = new ArrayList<Future<ReplicateWALEntryResponse>>(locations.size() - 1);

      // All passed entries should belong to one region because it is coming from the EntryBuffers
      // split per region. But the regions might split and merge (unlike log recovery case).
      for (int replicaId = 0; replicaId < locations.size(); replicaId++) {
        HRegionLocation location = locations.getRegionLocation(replicaId);
        if (!RegionReplicaUtil.isDefaultReplica(replicaId)) {
          // A replica may have no known location yet; derive its HRegionInfo from the primary.
          HRegionInfo regionInfo = location == null
              ? RegionReplicaUtil.getRegionInfoForReplica(
                locations.getDefaultRegionLocation().getRegionInfo(), replicaId)
              : location.getRegionInfo();
          RegionReplicaReplayCallable callable = new RegionReplicaReplayCallable(connection,
            rpcControllerFactory, tableName, location, regionInfo, row, entries,
            sink.getSkippedEditsCounter());
          Future<ReplicateWALEntryResponse> task = pool.submit(
            new RetryingRpcCallable<ReplicateWALEntryResponse>(rpcRetryingCallerFactory,
                callable, operationTimeout));
          tasks.add(task);
        }
      }

      // Wait for every replica replay; decide per failure whether it can be skipped.
      boolean tasksCancelled = false;
      for (int replicaId = 0; replicaId < tasks.size(); replicaId++) {
        try {
          tasks.get(replicaId).get();
        } catch (InterruptedException e) {
          throw new InterruptedIOException(e.getMessage());
        } catch (ExecutionException e) {
          Throwable cause = e.getCause();
          boolean canBeSkipped = false;
          if (cause instanceof IOException) {
            // The table can be disabled or dropped at this time. For disabled tables, we have no
            // cheap mechanism to detect this case because meta does not contain this information.
            // HConnection.isTableDisabled() is a zk call which we cannot do for every replay RPC.
            // So instead we start the replay RPC with retries and
            // check whether the table is dropped or disabled which might cause
            // SocketTimeoutException, or RetriesExhaustedException or similar if we get IOE.
            if (cause instanceof TableNotFoundException || connection.isTableDisabled(tableName)) {
              disabledAndDroppedTables.put(tableName, Boolean.TRUE); // put to cache for later.
              canBeSkipped = true;
            } else if (tableDescriptors != null) {
              HTableDescriptor tableDescriptor = tableDescriptors.get(tableName);
              if (tableDescriptor != null
                  // (replicaId + 1) as no task is added for primary replica for replication
                  && tableDescriptor.getRegionReplication() <= (replicaId + 1)) {
                canBeSkipped = true;
              }
            }
            if (canBeSkipped) {
              if (LOG.isTraceEnabled()) {
                LOG.trace("Skipping " + entries.size() + " entries in table " + tableName
                    + " because received exception for dropped or disabled table",
                  cause);
                for (Entry entry : entries) {
                  LOG.trace("Skipping : " + entry);
                }
              }

              if (!tasksCancelled) {
                sink.getSkippedEditsCounter().addAndGet(entries.size());
                tasksCancelled = true; // so that we do not add to skipped counter again
              }
              continue;
            }
            // otherwise rethrow
            throw (IOException)cause;
          }
          // unexpected exception
          throw new IOException(cause);
        }
      }
    }
  }
612 
  /**
   * Adapts a {@link RetryingCallable} to {@link Callable} so it can be submitted to an
   * {@link ExecutorService}; each invocation runs through the retrying RPC caller with
   * the given operation timeout.
   */
  static class RetryingRpcCallable<V> implements Callable<V> {
    RpcRetryingCallerFactory factory;
    RetryingCallable<V> callable;
    // Operation timeout (ms) handed to callWithRetries for each attempt sequence.
    int timeout;
    public RetryingRpcCallable(RpcRetryingCallerFactory factory, RetryingCallable<V> callable,
        int timeout) {
      this.factory = factory;
      this.callable = callable;
      this.timeout = timeout;
    }
    @Override
    public V call() throws Exception {
      // A fresh caller per invocation; the factory owns retry configuration.
      return factory.<V>newCaller().callWithRetries(callable, timeout);
    }
  }
628 
  /**
   * Calls replay on the passed edits for the given set of entries belonging to the region. It skips
   * the entry if the region boundaries have changed or the region is gone.
   */
  static class RegionReplicaReplayCallable
    extends RegionAdminServiceCallable<ReplicateWALEntryResponse> {

    private final List<Entry> entries;
    // Encoded name of the region the entries were originally written to; used to
    // detect splits/merges between buffering and replay.
    private final byte[] initialEncodedRegionName;
    private final AtomicLong skippedEntries;

    public RegionReplicaReplayCallable(ClusterConnection connection,
        RpcControllerFactory rpcControllerFactory, TableName tableName,
        HRegionLocation location, HRegionInfo regionInfo, byte[] row,List<Entry> entries,
        AtomicLong skippedEntries) {
      super(connection, rpcControllerFactory, location, tableName, row, regionInfo.getReplicaId());
      this.entries = entries;
      this.skippedEntries = skippedEntries;
      this.initialEncodedRegionName = regionInfo.getEncodedNameAsBytes();
    }

    @Override
    public ReplicateWALEntryResponse call(int timeout) throws IOException {
      return replayToServer(this.entries, timeout);
    }

    /**
     * Sends the entries to the located replica's regionserver via the admin replay RPC.
     * Returns an empty response (and bumps the skipped counter) when the located region
     * no longer matches the region the entries came from.
     */
    private ReplicateWALEntryResponse replayToServer(List<Entry> entries, int timeout)
        throws IOException {
      // check whether we should still replay this entry. If the regions are changed, or the
      // entry is not coming from the primary region, filter it out because we do not need it.
      // Regions can change because of (1) region split (2) region merge (3) table recreated
      boolean skip = false;

      if (!Bytes.equals(location.getRegionInfo().getEncodedNameAsBytes(),
        initialEncodedRegionName)) {
        skip = true;
      }
      if (!entries.isEmpty() && !skip) {
        Entry[] entriesArray = new Entry[entries.size()];
        entriesArray = entries.toArray(entriesArray);

        // set the region name for the target region replica
        Pair<AdminProtos.ReplicateWALEntryRequest, CellScanner> p =
            ReplicationProtbufUtil.buildReplicateWALEntryRequest(entriesArray,
              location.getRegionInfo().getEncodedNameAsBytes(), null, null, null);
        try {
          HBaseRpcController controller = rpcControllerFactory.newController(p.getSecond());
          controller.setCallTimeout(timeout);
          controller.setPriority(tableName);
          return stub.replay(controller, p.getFirst());
        } catch (ServiceException se) {
          // Unwrap the protobuf ServiceException back into the underlying IOException.
          throw ProtobufUtil.getRemoteException(se);
        }
      }

      if (skip) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("Skipping " + entries.size() + " entries in table " + tableName
            + " because located region " + location.getRegionInfo().getEncodedName()
            + " is different than the original region "
            + Bytes.toStringBinary(initialEncodedRegionName) + " from WALEdit")
;
          for (Entry entry : entries) {
            LOG.trace("Skipping : " + entry);
          }
        }
        skippedEntries.addAndGet(entries.size());
      }
      return ReplicateWALEntryResponse.newBuilder().build();
    }
  }
699 }