View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import com.google.common.collect.LinkedHashMultimap;
22  import java.io.IOException;
23  import java.io.InterruptedIOException;
24  import java.util.ArrayList;
25  import java.util.Arrays;
26  import java.util.Collection;
27  import java.util.Collections;
28  import java.util.Comparator;
29  import java.util.HashMap;
30  import java.util.HashSet;
31  import java.util.Iterator;
32  import java.util.List;
33  import java.util.Map;
34  import java.util.NavigableMap;
35  import java.util.Random;
36  import java.util.Set;
37  import java.util.TreeMap;
38  import java.util.concurrent.Callable;
39  import java.util.concurrent.ConcurrentHashMap;
40  import java.util.concurrent.CopyOnWriteArrayList;
41  import java.util.concurrent.ScheduledThreadPoolExecutor;
42  import java.util.concurrent.ThreadFactory;
43  import java.util.concurrent.TimeUnit;
44  import java.util.concurrent.atomic.AtomicBoolean;
45  import java.util.concurrent.atomic.AtomicInteger;
46  import java.util.concurrent.locks.Lock;
47  import java.util.concurrent.locks.ReentrantLock;
48  import org.apache.commons.logging.Log;
49  import org.apache.commons.logging.LogFactory;
50  import org.apache.hadoop.conf.Configuration;
51  import org.apache.hadoop.fs.FileSystem;
52  import org.apache.hadoop.fs.Path;
53  import org.apache.hadoop.hbase.CoordinatedStateException;
54  import org.apache.hadoop.hbase.HBaseIOException;
55  import org.apache.hadoop.hbase.HConstants;
56  import org.apache.hadoop.hbase.HRegionInfo;
57  import org.apache.hadoop.hbase.HRegionLocation;
58  import org.apache.hadoop.hbase.HTableDescriptor;
59  import org.apache.hadoop.hbase.MetaTableAccessor;
60  import org.apache.hadoop.hbase.NotServingRegionException;
61  import org.apache.hadoop.hbase.RegionLocations;
62  import org.apache.hadoop.hbase.RegionStateListener;
63  import org.apache.hadoop.hbase.RegionTransition;
64  import org.apache.hadoop.hbase.ServerName;
65  import org.apache.hadoop.hbase.TableName;
66  import org.apache.hadoop.hbase.TableNotFoundException;
67  import org.apache.hadoop.hbase.TableStateManager;
68  import org.apache.hadoop.hbase.classification.InterfaceAudience;
69  import org.apache.hadoop.hbase.client.Admin;
70  import org.apache.hadoop.hbase.client.Admin.MasterSwitchType;
71  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
72  import org.apache.hadoop.hbase.client.Result;
73  import org.apache.hadoop.hbase.coordination.BaseCoordinatedStateManager;
74  import org.apache.hadoop.hbase.coordination.OpenRegionCoordination;
75  import org.apache.hadoop.hbase.coordination.RegionMergeCoordination;
76  import org.apache.hadoop.hbase.coordination.SplitTransactionCoordination.SplitTransactionDetails;
77  import org.apache.hadoop.hbase.coordination.ZkOpenRegionCoordination;
78  import org.apache.hadoop.hbase.coordination.ZkRegionMergeCoordination;
79  import org.apache.hadoop.hbase.exceptions.DeserializationException;
80  import org.apache.hadoop.hbase.executor.EventHandler;
81  import org.apache.hadoop.hbase.executor.EventType;
82  import org.apache.hadoop.hbase.executor.ExecutorService;
83  import org.apache.hadoop.hbase.ipc.FailedServerException;
84  import org.apache.hadoop.hbase.ipc.RpcClient;
85  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
86  import org.apache.hadoop.hbase.master.RegionState.State;
87  import org.apache.hadoop.hbase.master.balancer.FavoredNodeAssignmentHelper;
88  import org.apache.hadoop.hbase.master.balancer.FavoredNodeLoadBalancer;
89  import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
90  import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
91  import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
92  import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
93  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
94  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
95  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
96  import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException;
97  import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
98  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
99  import org.apache.hadoop.hbase.util.ConfigUtil;
100 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
101 import org.apache.hadoop.hbase.util.FSUtils;
102 import org.apache.hadoop.hbase.util.KeyLocker;
103 import org.apache.hadoop.hbase.util.Pair;
104 import org.apache.hadoop.hbase.util.PairOfSameType;
105 import org.apache.hadoop.hbase.util.RetryCounter;
106 import org.apache.hadoop.hbase.util.Threads;
107 import org.apache.hadoop.hbase.util.Triple;
108 import org.apache.hadoop.hbase.util.VersionInfo;
109 import org.apache.hadoop.hbase.wal.DefaultWALProvider;
110 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
111 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
112 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
113 import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
114 import org.apache.hadoop.ipc.RemoteException;
115 import org.apache.hadoop.util.StringUtils;
116 import org.apache.zookeeper.AsyncCallback;
117 import org.apache.zookeeper.KeeperException;
118 import org.apache.zookeeper.KeeperException.NoNodeException;
119 import org.apache.zookeeper.KeeperException.NodeExistsException;
120 import org.apache.zookeeper.data.Stat;
121 
/**
 * Manages and performs region assignment.
 * <p>
 * Monitors ZooKeeper for events related to regions in transition.
 * <p>
 * Handles existing regions in transition during master failover.
 */
@InterfaceAudience.Private
public class AssignmentManager extends ZooKeeperListener {
  private static final Log LOG = LogFactory.getLog(AssignmentManager.class);

  // Sentinel server name used when HBCK manipulates assignment state directly.
  public static final ServerName HBCK_CODE_SERVERNAME = ServerName.valueOf(HConstants.HBCK_CODE_NAME,
      -1, -1L);

  // Wait time (ms) before retrying an assignment that raced with an
  // already-in-transition region.
  static final String ALREADY_IN_TRANSITION_WAITTIME
    = "hbase.assignment.already.intransition.waittime";
  static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute

  // Period (ms) of the scheduled task that retries regions stuck in FAILED_OPEN;
  // <= 0 disables the task (see constructor).
  static final String FAILED_OPEN_RETRY_KEY = "hbase.assignment.failed.open.retry.period";
  static final int FAILED_OPEN_RETRY_DEFAULT = 300000; // 5 minutes

  protected final MasterServices server;

  private ServerManager serverManager;

  // True only when the configured master load balancer is FavoredNodeLoadBalancer.
  private boolean shouldAssignRegionsWithFavoredNodes;

  private LoadBalancer balancer;

  private final MetricsAssignmentManager metricsAssignmentManager;

  private final TableLockManager tableLockManager;

  private AtomicInteger numRegionsOpened = new AtomicInteger(0);

  // Per-encoded-region-name locks; serializes assignment operations on one region.
  final private KeyLocker<String> locker = new KeyLocker<String>();

  // Region replicas queued to be closed, e.g. replicas of split parents and
  // of merged regions.
  Set<HRegionInfo> replicasToClose = Collections.synchronizedSet(new HashSet<HRegionInfo>());

  /**
   * Map of regions to reopen after the schema of a table is changed. Key -
   * encoded region name, value - HRegionInfo
   */
  private final Map <String, HRegionInfo> regionsToReopen;

  /*
   * Maximum times we recurse an assignment/unassignment.
   * See below in {@link #assign()} and {@link #unassign()}.
   */
  private final int maximumAttempts;

  /**
   * Map of two merging regions from the region to be created.
   * Key - encoded name of the merged region, value - the pair being merged.
   */
  private final Map<String, PairOfSameType<HRegionInfo>> mergingRegions
    = new HashMap<String, PairOfSameType<HRegionInfo>>();

  // Map of a splitting parent region to its pair of daughter regions.
  private final Map<HRegionInfo, PairOfSameType<HRegionInfo>> splitRegions
  = new HashMap<HRegionInfo, PairOfSameType<HRegionInfo>>();

  /**
   * The sleep time for which the assignment will wait before retrying in case of
   * hbase:meta assignment failure due to lack of availability of region plan or
   * a bad region plan.
   */
  private final long sleepTimeBeforeRetryingMetaAssignment;

  /** Plans for region movement. Key is the encoded version of a region name*/
  // TODO: When do plans get cleaned out?  Ever? In server open and in server
  // shutdown processing -- St.Ack
  // All access to this Map must be synchronized.
  final NavigableMap<String, RegionPlan> regionPlans =
    new TreeMap<String, RegionPlan>();

  private final TableStateManager tableStateManager;

  private final ExecutorService executorService;

  // For unit tests, keep track of calls to ClosedRegionHandler
  private Map<HRegionInfo, AtomicBoolean> closedRegionHandlerCalled = null;

  // For unit tests, keep track of calls to OpenedRegionHandler
  private Map<HRegionInfo, AtomicBoolean> openedRegionHandlerCalled = null;

  // Thread pool executor service for timeout monitor
  private java.util.concurrent.ExecutorService threadPoolExecutorService;
  // Runs the periodic FAILED_OPEN retry task scheduled in the constructor.
  private ScheduledThreadPoolExecutor scheduledThreadPoolExecutor;

  // A bunch of ZK events workers. Each is a single thread executor service
  private final java.util.concurrent.ExecutorService zkEventWorkers;

  // Region-transition event types that are ignored when the source RS is offline.
  private List<EventType> ignoreStatesRSOffline = Arrays.asList(
      EventType.RS_ZK_REGION_FAILED_OPEN, EventType.RS_ZK_REGION_CLOSED);

  private final RegionStates regionStates;

  // The thresholds for using bulk assigning. Bulk assignment is used
  // only if assigning at least this many regions to at least this
  // many servers. If assigning fewer regions to fewer servers,
  // bulk assigning may be not as efficient.
  private final int bulkAssignThresholdRegions;
  private final int bulkAssignThresholdServers;
  private final int bulkPerRegionOpenTimeGuesstimate;

  // Should bulk assignment wait till all regions are assigned,
  // or it is timed out?  This is useful to measure bulk assignment
  // performance, but not needed in most use cases.
  private final boolean bulkAssignWaitTillAllAssigned;

  /**
   * Indicator that AssignmentManager has recovered the region states so
   * that ServerShutdownHandler can be fully enabled and re-assign regions
   * of dead servers. So that when re-assignment happens, AssignmentManager
   * has proper region states.
   *
   * Protected to ease testing.
   */
  protected final AtomicBoolean failoverCleanupDone = new AtomicBoolean(false);

  /**
   * A map to track the count a region fails to open in a row.
   * So that we don't try to open a region forever if the failure is
   * unrecoverable.  We don't put this information in region states
   * because we don't expect this to happen frequently; we don't
   * want to copy this information over during each state transition either.
   */
  private final ConcurrentHashMap<String, AtomicInteger>
    failedOpenTracker = new ConcurrentHashMap<String, AtomicInteger>();

  // A flag to indicate if we are using ZK for region assignment
  private final boolean useZKForAssignment;

  // In case not using ZK for region assignment, region states
  // are persisted in meta with a state store
  private final RegionStateStore regionStateStore;

  /**
   * When the operator uses this configuration option, any version between
   * the current cluster version and the value of "hbase.min.version.move.system.tables"
   * does not trigger any auto-region movement. Auto-region movement here
   * refers to auto-migration of system table regions to newer server versions.
   * It is assumed that the configured range of versions does not require special
   * handling of moving system table regions to higher versioned RegionServer.
   * This auto-migration is done by {@link #checkIfShouldMoveSystemRegionAsync()}.
   * Example: Let's assume the cluster is on version 1.4.0 and we have
   * set "hbase.min.version.move.system.tables" as "2.0.0". Now if we upgrade
   * one RegionServer on 1.4.0 cluster to 1.6.0 (&lt; 2.0.0), then AssignmentManager will
   * not move hbase:meta, hbase:namespace and other system table regions
   * to newly brought up RegionServer 1.6.0 as part of auto-migration.
   * However, if we upgrade one RegionServer on 1.4.0 cluster to 2.2.0 (&gt; 2.0.0),
   * then AssignmentManager will move all system table regions to newly brought
   * up RegionServer 2.2.0 as part of auto-migration done by
   * {@link #checkIfShouldMoveSystemRegionAsync()}.
   * "hbase.min.version.move.system.tables" is introduced as part of HBASE-22923.
   */
  private final String minVersionToMoveSysTables;

  private static final String MIN_VERSION_MOVE_SYS_TABLES_CONFIG =
      "hbase.min.version.move.system.tables";
  private static final String DEFAULT_MIN_VERSION_MOVE_SYS_TABLES_CONFIG = "";

  /**
   * For testing only!  Set to true to skip handling of split and merge.
   */
  private static boolean TEST_SKIP_SPLIT_HANDLING = false;
  private static boolean TEST_SKIP_MERGE_HANDLING = false;

  /** Listeners that are called on assignment events. */
  private List<AssignmentListener> listeners = new CopyOnWriteArrayList<AssignmentListener>();

  private RegionStateListener regionStateListener;

  // Tri-state answer to "does this server host this region?"; UNKNOWN when the
  // question cannot be answered.
  public enum ServerHostRegion {
    NOT_HOSTING_REGION, HOSTING_REGION, UNKNOWN,
  }

  // Retry/backoff settings for FAILED_OPEN region retries (see constructor).
  private RetryCounter.BackoffPolicy backoffPolicy;
  private RetryCounter.RetryConfig retryConfig;

  // Guards checkIfShouldMoveSystemRegionAsync-related state.
  private final Object checkIfShouldMoveSystemRegionLock = new Object();
  /**
   * Constructs a new assignment manager.
   *
   * @param server instance of HMaster this AM running inside
   * @param serverManager serverManager for associated HMaster
   * @param balancer implementation of {@link LoadBalancer}
   * @param service Executor service
   * @param metricsMaster metrics manager
   * @param tableLockManager TableLock manager
   * @throws KeeperException
   * @throws IOException
   */
  public AssignmentManager(MasterServices server, ServerManager serverManager,
      final LoadBalancer balancer,
      final ExecutorService service, MetricsMaster metricsMaster,
      final TableLockManager tableLockManager) throws KeeperException,
        IOException, CoordinatedStateException {
    super(server.getZooKeeper());
    this.server = server;
    this.serverManager = serverManager;
    this.executorService = service;
    this.regionStateStore = new RegionStateStore(server);
    this.regionsToReopen = Collections.synchronizedMap
                           (new HashMap<String, HRegionInfo> ());
    Configuration conf = server.getConfiguration();
    // Only read favored nodes if using the favored nodes load balancer.
    this.shouldAssignRegionsWithFavoredNodes = conf.getClass(
           HConstants.HBASE_MASTER_LOADBALANCER_CLASS, Object.class).equals(
           FavoredNodeLoadBalancer.class);
    try {
      // tableStateManager may legitimately be null when no coordinated state
      // manager is configured; downstream users must tolerate that.
      if (server.getCoordinatedStateManager() != null) {
        this.tableStateManager = server.getCoordinatedStateManager().getTableStateManager();
      } else {
        this.tableStateManager = null;
      }
    } catch (InterruptedException e) {
      throw new InterruptedIOException();
    }
    // This is the max attempts, not retries, so it should be at least 1.
    this.maximumAttempts = Math.max(1,
      this.server.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10));
    this.sleepTimeBeforeRetryingMetaAssignment = this.server.getConfiguration().getLong(
        "hbase.meta.assignment.retry.sleeptime", 1000l);
    this.balancer = balancer;
    int maxThreads = conf.getInt("hbase.assignment.threads.max", 30);

    // General-purpose pool for assignment work (daemon threads, 60s keep-alive).
    this.threadPoolExecutorService = Threads.getBoundedCachedThreadPool(
        maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM."));

    // Single-threaded scheduler; used below for the FAILED_OPEN retry task.
    this.scheduledThreadPoolExecutor = new ScheduledThreadPoolExecutor(1,
        Threads.newDaemonThreadFactory("AM.Scheduler"));

    // Note: tableStateManager must already be set (above) before building RegionStates.
    this.regionStates = new RegionStates(
      server, tableStateManager, serverManager, regionStateStore);

    this.bulkAssignWaitTillAllAssigned =
      conf.getBoolean("hbase.bulk.assignment.waittillallassigned", false);
    this.bulkAssignThresholdRegions = conf.getInt("hbase.bulk.assignment.threshold.regions", 7);
    this.bulkAssignThresholdServers = conf.getInt("hbase.bulk.assignment.threshold.servers", 3);
    this.bulkPerRegionOpenTimeGuesstimate =
      conf.getInt("hbase.bulk.assignment.perregion.open.time", 10000);

    // Pool handling ZooKeeper region-transition events.
    int workers = conf.getInt("hbase.assignment.zkevent.workers", 20);
    ThreadFactory threadFactory = Threads.newDaemonThreadFactory("AM.ZK.Worker");
    zkEventWorkers = Threads.getBoundedCachedThreadPool(workers, 60L,
            TimeUnit.SECONDS, threadFactory);
    this.tableLockManager = tableLockManager;

    this.metricsAssignmentManager = new MetricsAssignmentManager();
    useZKForAssignment = ConfigUtil.useZKForAssignment(conf);
    // Configurations for retrying opening a region on receiving a FAILED_OPEN
    this.retryConfig = new RetryCounter.RetryConfig();
    this.retryConfig.setSleepInterval(conf.getLong("hbase.assignment.retry.sleep.initial", 0l));
    // Set the max time limit to the initial sleep interval so we use a constant time sleep strategy
    // if the user does not set a max sleep time
    this.retryConfig.setMaxSleepTime(conf.getLong("hbase.assignment.retry.sleep.max",
        retryConfig.getSleepInterval()));
    // NOTE(review): overridable method called from constructor — intentional hook
    // for tests to supply a different policy; subclasses must not rely on fields
    // initialized after this point.
    this.backoffPolicy = getBackoffPolicy();

    // Periodically retry regions stuck in FAILED_OPEN; a non-positive period disables this.
    int failedOpenRetryPeriod = conf.getInt(FAILED_OPEN_RETRY_KEY, FAILED_OPEN_RETRY_DEFAULT);
    if (failedOpenRetryPeriod > 0) {
      scheduledThreadPoolExecutor.scheduleWithFixedDelay(new FailedOpenRetryRunnable(),
        failedOpenRetryPeriod, failedOpenRetryPeriod, TimeUnit.MILLISECONDS);
    }
    minVersionToMoveSysTables = conf.get(MIN_VERSION_MOVE_SYS_TABLES_CONFIG,
        DEFAULT_MIN_VERSION_MOVE_SYS_TABLES_CONFIG);
  }
389 
390   /**
391    * Returns the backoff policy used for Failed Region Open retries
392    * @return the backoff policy used for Failed Region Open retries
393    */
394   RetryCounter.BackoffPolicy getBackoffPolicy() {
395     return new RetryCounter.ExponentialBackoffPolicyWithLimit();
396   }
397 
398   MetricsAssignmentManager getAssignmentManagerMetrics() {
399     return this.metricsAssignmentManager;
400   }
401 
402   /**
403    * Add the listener to the notification list.
404    * @param listener The AssignmentListener to register
405    */
406   public void registerListener(final AssignmentListener listener) {
407     this.listeners.add(listener);
408   }
409 
410   /**
411    * Remove the listener from the notification list.
412    * @param listener The AssignmentListener to unregister
413    */
414   public boolean unregisterListener(final AssignmentListener listener) {
415     return this.listeners.remove(listener);
416   }
417 
418   /**
419    * @return Instance of ZKTableStateManager.
420    */
421   public TableStateManager getTableStateManager() {
422     // These are 'expensive' to make involving trip to zk ensemble so allow
423     // sharing.
424     return this.tableStateManager;
425   }
426 
427   /**
428    * This SHOULD not be public. It is public now
429    * because of some unit tests.
430    *
431    * TODO: make it package private and keep RegionStates in the master package
432    */
433   public RegionStates getRegionStates() {
434     return regionStates;
435   }
436 
437   /**
438    * Used in some tests to mock up region state in meta
439    */
440   RegionStateStore getRegionStateStore() {
441     return regionStateStore;
442   }
443 
444   public RegionPlan getRegionReopenPlan(HRegionInfo hri) {
445     return new RegionPlan(hri, null, regionStates.getRegionServerOfRegion(hri));
446   }
447 
448   /**
449    * Add a regionPlan for the specified region.
450    * @param encodedName
451    * @param plan
452    */
453   public void addPlan(String encodedName, RegionPlan plan) {
454     synchronized (regionPlans) {
455       regionPlans.put(encodedName, plan);
456     }
457   }
458 
459   /**
460    * Add a map of region plans.
461    */
462   public void addPlans(Map<String, RegionPlan> plans) {
463     synchronized (regionPlans) {
464       regionPlans.putAll(plans);
465     }
466   }
467 
468   /**
469    * Set the list of regions that will be reopened
470    * because of an update in table schema
471    *
472    * @param regions
473    *          list of regions that should be tracked for reopen
474    */
475   public void setRegionsToReopen(List <HRegionInfo> regions) {
476     for(HRegionInfo hri : regions) {
477       regionsToReopen.put(hri.getEncodedName(), hri);
478     }
479   }
480 
481   /**
482    * Used by the client to identify if all regions have the schema updates
483    *
484    * @param tableName
485    * @return Pair indicating the status of the alter command
486    * @throws IOException
487    */
488   public Pair<Integer, Integer> getReopenStatus(TableName tableName)
489       throws IOException {
490     List<HRegionInfo> hris;
491     if (TableName.META_TABLE_NAME.equals(tableName)) {
492       hris = new MetaTableLocator().getMetaRegions(server.getZooKeeper());
493     } else {
494       hris = MetaTableAccessor.getTableRegions(server.getZooKeeper(),
495         server.getConnection(), tableName, true);
496     }
497 
498     Integer pending = 0;
499     for (HRegionInfo hri : hris) {
500       String name = hri.getEncodedName();
501       // no lock concurrent access ok: sequential consistency respected.
502       if (regionsToReopen.containsKey(name)
503           || regionStates.isRegionInTransition(name)) {
504         pending++;
505       }
506     }
507     return new Pair<Integer, Integer>(pending, hris.size());
508   }
509 
510   /**
511    * Used by ServerShutdownHandler to make sure AssignmentManager has completed
512    * the failover cleanup before re-assigning regions of dead servers. So that
513    * when re-assignment happens, AssignmentManager has proper region states.
514    */
515   public boolean isFailoverCleanupDone() {
516     return failoverCleanupDone.get();
517   }
518 
519   /**
520    * To avoid racing with AM, external entities may need to lock a region,
521    * for example, when SSH checks what regions to skip re-assigning.
522    */
523   public Lock acquireRegionLock(final String encodedName) {
524     return locker.acquireLock(encodedName);
525   }
526 
527   /**
528    * Now, failover cleanup is completed. Notify server manager to
529    * process queued up dead servers processing, if any.
530    */
531   void failoverCleanupDone() {
532     failoverCleanupDone.set(true);
533     serverManager.processQueuedDeadServers();
534   }
535 
  /**
   * Called on startup.
   * Figures whether this is a fresh cluster start or we are joining an extant running cluster.
   * @throws IOException
   * @throws KeeperException
   * @throws InterruptedException
   * @throws CoordinatedStateException
   */
  void joinCluster() throws IOException,
      KeeperException, InterruptedException, CoordinatedStateException {
    long startTime = System.currentTimeMillis();
    // Concurrency note: In the below the accesses on regionsInTransition are
    // outside of a synchronization block where usually all accesses to RIT are
    // synchronized.  The presumption is that in this case it is safe since this
    // method is being played by a single thread on startup.

    // TODO: Regions that have a null location and are not in regionsInTransitions
    // need to be handled.

    // Scan hbase:meta to build list of existing regions, servers, and assignment.
    // Returns servers who have not checked in (assumed dead) that some regions
    // were assigned to (according to the meta).
    Set<ServerName> deadServers = rebuildUserRegions();

    // This method will assign all user regions if a clean server startup or
    // it will reconstruct master state and cleanup any leftovers from previous master process.
    boolean failover = processDeadServersAndRegionsInTransition(deadServers);

    if (!useZKForAssignment) {
      // Not using ZK for assignment any more, so remove the now-unused ZNode.
      ZKUtil.deleteNodeRecursively(watcher, watcher.assignmentZNode);
    }
    // Resume any table DISABLE/ENABLE operations a previous master left half-done.
    recoverTableInDisablingState();
    recoverTableInEnablingState();
    LOG.info("Joined the cluster in " + (System.currentTimeMillis()
      - startTime) + "ms, failover=" + failover);
  }
573 
  /**
   * Process all regions that are in transition in zookeeper and also
   * processes the list of dead servers.
   * Used by master joining a cluster.  If we figure this is a clean cluster
   * startup, will assign all user regions.
   * @param deadServers Set of servers that are offline probably legitimately that were carrying
   * regions according to a scan of hbase:meta. Can be null.
   * @return true if this startup was treated as a failover, false for a clean startup
   * @throws KeeperException
   * @throws IOException
   * @throws InterruptedException
   */
  boolean processDeadServersAndRegionsInTransition(final Set<ServerName> deadServers)
  throws KeeperException, IOException, InterruptedException, CoordinatedStateException {
    List<String> nodes = ZKUtil.listChildrenNoWatch(watcher, watcher.assignmentZNode);

    if (useZKForAssignment && nodes == null) {
      // Can't tell what is in transition; abort rather than guess.
      String errorMessage = "Failed to get the children from ZK";
      server.abort(errorMessage, new IOException(errorMessage));
      return true; // Doesn't matter in this case
    }

    // Failover detection cascade: any one of the checks below flipping
    // 'failover' to true short-circuits the rest.
    boolean failover = !serverManager.getDeadServers().isEmpty();
    if (failover) {
      // This may not be a failover actually, especially if meta is on this master.
      if (LOG.isDebugEnabled()) {
        LOG.debug("Found dead servers out on cluster " + serverManager.getDeadServers());
      }
    } else {
      // If any one region except meta is assigned, it's a failover.
      Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
      for (Map.Entry<HRegionInfo, ServerName> en:
          regionStates.getRegionAssignments().entrySet()) {
        HRegionInfo hri = en.getKey();
        if (!hri.isMetaTable()
            && onlineServers.contains(en.getValue())) {
          LOG.debug("Found " + hri + " out on cluster");
          failover = true;
          break;
        }
      }
      if (!failover && nodes != null) {
        // If any one region except meta is in transition, it's a failover.
        for (String encodedName: nodes) {
          RegionState regionState = regionStates.getRegionState(encodedName);
          if (regionState != null && !regionState.getRegion().isMetaRegion()) {
            LOG.debug("Found " + regionState + " in RITs");
            failover = true;
            break;
          }
        }
      }
    }
    if (!failover && !useZKForAssignment) {
      // If any region except meta is in transition on a live server, it's a failover.
      Set<RegionState> regionsInTransition = regionStates.getRegionsInTransition();
      if (!regionsInTransition.isEmpty()) {
        Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
        for (RegionState regionState: regionsInTransition) {
          ServerName serverName = regionState.getServerName();
          if (!regionState.getRegion().isMetaRegion()
              && serverName != null && onlineServers.contains(serverName)) {
            LOG.debug("Found " + regionState + " in RITs");
            failover = true;
            break;
          }
        }
      }
    }
    if (!failover) {
      // If we get here, we have a full cluster restart. It is a failover only
      // if there are some WALs that are not split yet. For meta WALs, they should have
      // been split already, if any. We can walk through those queued dead servers;
      // if they don't have any WALs, this restart should be considered a clean one.
      Set<ServerName> queuedDeadServers = serverManager.getRequeuedDeadServers().keySet();
      if (!queuedDeadServers.isEmpty()) {
        Configuration conf = server.getConfiguration();
        Path walRootDir = FSUtils.getWALRootDir(conf);
        FileSystem walFs = FSUtils.getWALFileSystem(conf);
        for (ServerName serverName: queuedDeadServers) {
          // In the case of a clean exit, the shutdown handler would have presplit any WALs and
          // removed empty directories.
          Path walDir = new Path(walRootDir,
              DefaultWALProvider.getWALDirectoryName(serverName.toString()));
          Path splitDir = walDir.suffix(DefaultWALProvider.SPLITTING_EXT);
          if (walFs.exists(walDir) || walFs.exists(splitDir)) {
            LOG.debug("Found queued dead server " + serverName);
            failover = true;
            break;
          }
        }
        if (!failover) {
          // We figured that it's not a failover, so no need to
          // work on these re-queued dead servers any more.
          LOG.info("AM figured that it's not a failover and cleaned up "
            + queuedDeadServers.size() + " queued dead servers");
          serverManager.removeRequeuedDeadServers();
        }
      }
    }

    Map<HRegionInfo, ServerName> allRegions = null;
    if (!failover) {
      // Retrieve user regions except tables region that are in disabled/disabling/enabling states.
      allRegions = getUserRegionsToAssign();
    }

    // Now region states are restored
    regionStateStore.start();

    // If we found user regions out on cluster, its a failover.
    if (failover) {
      LOG.info("Found regions out on cluster or in RIT; presuming failover");
      // Process list of dead servers and regions in RIT.
      // See HBASE-4580 for more information.
      processDeadServersAndRecoverLostRegions(deadServers);

      // Handle the scenario when meta is rebuilt by the OfflineMetaRepair tool.
      // In this scenario, meta will have only info:regioninfo entries (won't contain info:server)
      // which leads SSH/SCP to skip holding region assignment.
      if (!MetaTableAccessor.infoServerExists(server.getConnection())) {
        // Need to assign the user regions as a fresh startup, otherwise user region assignment
        // will never happen.
        assignRegionsOnSSHCompletion();
      }
    }

    if (!failover && useZKForAssignment) {
      // Cleanup any existing ZK nodes and start watching
      ZKAssign.deleteAllNodes(watcher);
      ZKUtil.listChildrenAndWatchForNewChildren(this.watcher,
        this.watcher.assignmentZNode);
    }

    // Now we can safely claim failover cleanup completed and enable
    // ServerShutdownHandler for further processing. The nodes (below)
    // in transition, if any, are for regions not related to those
    // dead servers at all, and can be done in parallel to SSH.
    failoverCleanupDone();
    if (!failover) {
      // Fresh cluster startup.
      LOG.info("Clean cluster startup. Assigning user regions");
      assignAllUserRegions(allRegions);
    }
    // Unassign replicas of the split parents and the merged regions;
    // the daughter replicas are opened in assignAllUserRegions if not
    // already opened.
    for (HRegionInfo h : replicasToClose) {
      unassign(h);
    }
    replicasToClose.clear();
    return failover;
  }
726 
727   /*
728    * At cluster clean re/start, mark all user regions closed except those of tables that are
729    * excluded, such as disabled/disabling/enabling tables. All user regions and their previous
730    * locations are returned.
731    */
732   private Map<HRegionInfo, ServerName> getUserRegionsToAssign()
733       throws InterruptedIOException, CoordinatedStateException {
734     Set<TableName> disabledOrDisablingOrEnabling =
735         tableStateManager.getTablesInStates(ZooKeeperProtos.Table.State.DISABLED,
736           ZooKeeperProtos.Table.State.DISABLING, ZooKeeperProtos.Table.State.ENABLING);
737 
738     // Clean re/start, mark all user regions closed before reassignment
739     return regionStates.closeAllUserRegions(disabledOrDisablingOrEnabling);
740   }
741 
742   /*
743    * Wait for SSH completion and assign user region which are not in disabled/disabling/enabling
744    * table states.
745    */
746   private void assignRegionsOnSSHCompletion() {
747     LOG.info("Meta is rebuild by OfflineMetaRepair tool, assigning all user regions.");
748     Thread regionAssignerThread = new Thread("RegionAssignerOnMetaRebuild") {
749       @Override
750       public void run() {
751         // Wait until all dead server processing finish
752         while (serverManager.areDeadServersInProgress()) {
753           try {
754             Thread.sleep(100);
755           } catch (InterruptedException e) {
756             LOG.warn("RegionAssignerOnMetaRebuild got interrupted.", e);
757             Thread.currentThread().interrupt();
758             return;
759           }
760         }
761         LOG.info("SSH has been completed for all dead servers, assigning user regions.");
762         try {
763           // Assign the regions
764           assignAllUserRegions(getUserRegionsToAssign());
765         } catch (CoordinatedStateException | IOException | InterruptedException e) {
766           LOG.error("Exception occured while assigning user regions.", e);
767         }
768       };
769     };
770     regionAssignerThread.setDaemon(true);
771     regionAssignerThread.start();
772   }
773 
774   /**
775    * If region is up in zk in transition, then do fixup and block and wait until
776    * the region is assigned and out of transition.  Used on startup for
777    * catalog regions.
778    * @param hri Region to look for.
779    * @return True if we processed a region in transition else false if region
780    * was not up in zk in transition.
781    * @throws InterruptedException
782    * @throws KeeperException
783    * @throws IOException
784    */
785   boolean processRegionInTransitionAndBlockUntilAssigned(final HRegionInfo hri)
786       throws InterruptedException, KeeperException, IOException {
787     String encodedRegionName = hri.getEncodedName();
788     if (!processRegionInTransition(encodedRegionName, hri)) {
789       return false; // The region is not in transition
790     }
791     LOG.debug("Waiting on " + HRegionInfo.prettyPrint(encodedRegionName));
792     while (!this.server.isStopped() &&
793         this.regionStates.isRegionInTransition(encodedRegionName)) {
794       RegionState state = this.regionStates.getRegionTransitionState(encodedRegionName);
795       if (state == null || !serverManager.isServerOnline(state.getServerName())) {
796         // The region is not in transition, or not in transition on an online
797         // server. Doesn't help to block here any more. Caller need to
798         // verify the region is actually assigned.
799         break;
800       }
801       this.regionStates.waitForUpdate(100);
802     }
803     return true;
804   }
805 
806   /**
807    * Process failover of new master for region <code>encodedRegionName</code>
808    * up in zookeeper.
809    * @param encodedRegionName Region to process failover for.
810    * @param regionInfo If null we'll go get it from meta table.
811    * @return True if we processed <code>regionInfo</code> as a RIT.
812    * @throws KeeperException
813    * @throws IOException
814    */
815   boolean processRegionInTransition(final String encodedRegionName,
816       final HRegionInfo regionInfo) throws KeeperException, IOException {
817     // We need a lock here to ensure that we will not put the same region twice
818     // It has no reason to be a lock shared with the other operations.
819     // We can do the lock on the region only, instead of a global lock: what we want to ensure
820     // is that we don't have two threads working on the same region.
821     Lock lock = locker.acquireLock(encodedRegionName);
822     try {
823       Stat stat = new Stat();
824       byte [] data = ZKAssign.getDataAndWatch(watcher, encodedRegionName, stat);
825       if (data == null) return false;
826       RegionTransition rt;
827       try {
828         rt = RegionTransition.parseFrom(data);
829       } catch (DeserializationException e) {
830         LOG.warn("Failed parse znode data", e);
831         return false;
832       }
833       HRegionInfo hri = regionInfo;
834       if (hri == null) {
835         // The region info is not passed in. We will try to find the region
836         // from region states map/meta based on the encoded region name. But we
837         // may not be able to find it. This is valid for online merge that
838         // the region may have not been created if the merge is not completed.
839         // Therefore, it is not in meta at master recovery time.
840         hri = regionStates.getRegionInfo(rt.getRegionName());
841         EventType et = rt.getEventType();
842         if (hri == null && et != EventType.RS_ZK_REGION_MERGING
843             && et != EventType.RS_ZK_REQUEST_REGION_MERGE) {
844           LOG.warn("Couldn't find the region in recovering " + rt);
845           return false;
846         }
847       }
848 
849       // TODO: This code is tied to ZK anyway, so for now leaving it as is,
850       // will refactor when whole region assignment will be abstracted from ZK
851       BaseCoordinatedStateManager cp =
852         (BaseCoordinatedStateManager) this.server.getCoordinatedStateManager();
853       OpenRegionCoordination openRegionCoordination = cp.getOpenRegionCoordination();
854 
855       ZkOpenRegionCoordination.ZkOpenRegionDetails zkOrd =
856         new ZkOpenRegionCoordination.ZkOpenRegionDetails();
857       zkOrd.setVersion(stat.getVersion());
858       zkOrd.setServerName(cp.getServer().getServerName());
859 
860       return processRegionsInTransition(
861         rt, hri, openRegionCoordination, zkOrd);
862     } finally {
863       lock.unlock();
864     }
865   }
866 
  /**
   * Retrieve HRegionInfo for given region name
   *
   * <p>Thin delegation to the in-memory region states map; may return null when the
   * region is unknown there.
   *
   * @param regionName Region name in byte[]
   * @return HRegionInfo
   */
  public HRegionInfo getRegionInfo(final byte[] regionName) {
    return regionStates.getRegionInfo(regionName);
  }
876 
  /**
   * This call is invoked only (1) master assign meta;
   * (2) during failover mode startup, zk assignment node processing.
   * The locker is set in the caller. It returns true if the region
   * is in transition for sure, false otherwise.
   *
   * It should be private but it is used by some test too.
   *
   * @param rt region transition read from the unassigned znode
   * @param regionInfo region the transition refers to; may be null only for merge
   *   events (the merged region may not exist yet)
   * @param coordination coordination for opening region
   * @param ord open-region details (zk version, server name)
   * @return true if the region is in transition for sure, false otherwise
   * @throws KeeperException on unrecoverable zookeeper errors
   */
  boolean processRegionsInTransition(
      final RegionTransition rt, final HRegionInfo regionInfo,
      OpenRegionCoordination coordination,
      final OpenRegionCoordination.OpenRegionDetails ord) throws KeeperException {
    EventType et = rt.getEventType();
    // Get ServerName.  Could not be null.
    final ServerName sn = rt.getServerName();
    final byte[] regionName = rt.getRegionName();
    final String encodedName = HRegionInfo.encodeRegionName(regionName);
    final String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
    LOG.info("Processing " + prettyPrintedRegionName + " in state: " + et);

    if (regionStates.isRegionInTransition(encodedName)
        && (regionInfo.isMetaRegion() || !useZKForAssignment)) {
      LOG.info("Processed region " + prettyPrintedRegionName + " in state: "
        + et + ", does nothing since the region is already in transition "
        + regionStates.getRegionTransitionState(encodedName));
      // Just return
      return true;
    }
    if (!serverManager.isServerOnline(sn)) {
      // It was transitioning on a dead server, so it's closed now.
      // Force to OFFLINE and put it in transition, but not assign it
      // since log splitting for the dead server is not done yet.
      LOG.debug("RIT " + encodedName + " in state=" + rt.getEventType() +
        " was on deadserver; forcing offline");
      if (regionStates.isRegionOnline(regionInfo)) {
        // Meta could still show the region is assigned to the previous
        // server. If that server is online, when we reload the meta, the
        // region is put back to online, we need to offline it.
        regionStates.regionOffline(regionInfo);
        sendRegionClosedNotification(regionInfo);
      }
      // Put it back in transition so that SSH can re-assign it
      regionStates.updateRegionState(regionInfo, State.OFFLINE, sn);

      if (regionInfo.isMetaRegion()) {
        // If it's meta region, reset the meta location.
        // So that master knows the right meta region server.
        MetaTableLocator.setMetaLocation(watcher, sn, State.OPEN);
      } else {
        // No matter the previous server is online or offline,
        // we need to reset the last region server of the region.
        regionStates.setLastRegionServerOfRegion(sn, encodedName);
        // Make sure we know the server is dead.
        if (!serverManager.isServerDead(sn)) {
          serverManager.expireServer(sn);
        }
      }
      return false;
    }
    switch (et) {
      case M_ZK_REGION_CLOSING:
        // Insert into RIT & resend the query to the region server: may be the previous master
        // died before sending the query the first time.
        final RegionState rsClosing = regionStates.updateRegionState(rt, State.CLOSING);
        // The unassign/assign is done asynchronously; the handler re-acquires the
        // region lock itself since this method's caller-held lock won't cover it.
        this.executorService.submit(
          new EventHandler(server, EventType.M_MASTER_RECOVERY) {
            @Override
            public void process() throws IOException {
              ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
              try {
                final int expectedVersion = ((ZkOpenRegionCoordination.ZkOpenRegionDetails) ord)
                  .getVersion();
                unassign(regionInfo, rsClosing, expectedVersion, null, useZKForAssignment, null);
                if (regionStates.isRegionOffline(regionInfo)) {
                  assign(regionInfo, true);
                }
              } finally {
                lock.unlock();
              }
            }
          });
        break;

      case RS_ZK_REGION_CLOSED:
      case RS_ZK_REGION_FAILED_OPEN:
        // Region is closed, insert into RIT and handle it
        regionStates.setRegionStateTOCLOSED(regionInfo, sn);
        if (!replicasToClose.contains(regionInfo)) {
          invokeAssign(regionInfo);
        } else {
          // Replica scheduled for closing: take it offline instead of re-assigning.
          offlineDisabledRegion(regionInfo);
        }
        break;

      case M_ZK_REGION_OFFLINE:
        // Insert in RIT and resend to the regionserver
        regionStates.updateRegionState(rt, State.OFFLINE);
        final RegionState rsOffline = regionStates.getRegionState(regionInfo);
        this.executorService.submit(
          new EventHandler(server, EventType.M_MASTER_RECOVERY) {
            @Override
            public void process() throws IOException {
              ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
              try {
                // Plan targets the server recorded in the znode transition.
                RegionPlan plan = new RegionPlan(regionInfo, null, sn);
                addPlan(encodedName, plan);
                assign(rsOffline, true, false);
              } finally {
                lock.unlock();
              }
            }
          });
        break;

      case RS_ZK_REGION_OPENING:
        regionStates.updateRegionState(rt, State.OPENING);
        break;

      case RS_ZK_REGION_OPENED:
        // Region is opened, insert into RIT and handle it
        // This could be done asynchronously, we would need then to acquire the lock in the
        //  handler.
        regionStates.updateRegionState(rt, State.OPEN);
        new OpenedRegionHandler(server, this, regionInfo, coordination, ord).process();
        break;
      case RS_ZK_REQUEST_REGION_SPLIT:
      case RS_ZK_REGION_SPLITTING:
      case RS_ZK_REGION_SPLIT:
        // Splitting region should be online. We could have skipped it during
        // user region rebuilding since we may consider the split is completed.
        // Put it in SPLITTING state to avoid complications.
        regionStates.regionOnline(regionInfo, sn);
        regionStates.updateRegionState(rt, State.SPLITTING);
        if (!handleRegionSplitting(
            rt, encodedName, prettyPrintedRegionName, sn)) {
          // Split handshake could not be restored; clean up the splitting znode.
          deleteSplittingNode(encodedName, sn);
        }
        break;
      case RS_ZK_REQUEST_REGION_MERGE:
      case RS_ZK_REGION_MERGING:
      case RS_ZK_REGION_MERGED:
        if (!handleRegionMerging(
            rt, encodedName, prettyPrintedRegionName, sn)) {
          // Merge handshake could not be restored; clean up the merging znode.
          deleteMergingNode(encodedName, sn);
        }
        break;
      default:
        throw new IllegalStateException("Received region in state:" + et + " is not valid.");
    }
    LOG.info("Processed region " + prettyPrintedRegionName + " in state "
      + et + ", on " + (serverManager.isServerOnline(sn) ? "" : "dead ")
      + "server: " + sn);
    return true;
  }
1031 
1032   /**
1033    * When a region is closed, it should be removed from the regionsToReopen
1034    * @param hri HRegionInfo of the region which was closed
1035    */
1036   public void removeClosedRegion(HRegionInfo hri) {
1037     if (regionsToReopen.remove(hri.getEncodedName()) != null) {
1038       LOG.debug("Removed region from reopening regions because it was closed");
1039     }
1040   }
1041 
  /**
   * Handles various states an unassigned node can be in.
   * <p>
   * Method is called when a state change is suspected for an unassigned node.
   * <p>
   * This deals with skipped transitions (we got a CLOSED but didn't see CLOSING
   * yet).
   * <p>
   * All state mutation happens under the per-region lock obtained from the locker,
   * so the switch below runs serially per region.
   * @param rt region transition
   * @param coordination coordination for opening region
   * @param ord details about opening region
   */
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION",
      justification="Needs work; says access to ConcurrentHashMaps not ATOMIC!!!")
  void handleRegion(final RegionTransition rt, OpenRegionCoordination coordination,
                    OpenRegionCoordination.OpenRegionDetails ord) {
    if (rt == null) {
      LOG.warn("Unexpected NULL input for RegionTransition rt");
      return;
    }
    final ServerName sn = rt.getServerName();
    // Check if this is a special HBCK transition
    if (sn.equals(HBCK_CODE_SERVERNAME)) {
      handleHBCK(rt);
      return;
    }
    final long createTime = rt.getCreateTime();
    final byte[] regionName = rt.getRegionName();
    String encodedName = HRegionInfo.encodeRegionName(regionName);
    String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
    // Verify this is a known server
    if (!serverManager.isServerOnline(sn)
      && !ignoreStatesRSOffline.contains(rt.getEventType())) {
      LOG.warn("Attempted to handle region transition for server but " +
        "it is not online: " + prettyPrintedRegionName + ", " + rt);
      return;
    }

    RegionState regionState =
      regionStates.getRegionState(encodedName);
    long startTime = System.currentTimeMillis();
    if (LOG.isDebugEnabled()) {
      // Flag events older than 15 seconds; they may act on stale state.
      boolean lateEvent = createTime < (startTime - 15000);
      LOG.debug("Handling " + rt.getEventType() +
        ", server=" + sn + ", region=" +
        (prettyPrintedRegionName == null ? "null" : prettyPrintedRegionName) +
        (lateEvent ? ", which is more than 15 seconds late" : "") +
        ", current_state=" + regionState);
    }
    // We don't do anything for this event,
    // so separate it out, no need to lock/unlock anything
    if (rt.getEventType() == EventType.M_ZK_REGION_OFFLINE) {
      return;
    }

    // We need a lock on the region as we could update it
    Lock lock = locker.acquireLock(encodedName);
    try {
      // Re-read state under the lock; it may have changed while we were waiting.
      RegionState latestState =
        regionStates.getRegionState(encodedName);
      if ((regionState == null && latestState != null)
          || (regionState != null && latestState == null)
          || (regionState != null && latestState != null
            && latestState.getState() != regionState.getState())) {
        LOG.warn("Region state changed from " + regionState + " to "
          + latestState + ", while acquiring lock");
      }
      long waitedTime = System.currentTimeMillis() - startTime;
      if (waitedTime > 5000) {
        LOG.warn("Took " + waitedTime + "ms to acquire the lock");
      }
      regionState = latestState;
      switch (rt.getEventType()) {
      case RS_ZK_REQUEST_REGION_SPLIT:
      case RS_ZK_REGION_SPLITTING:
      case RS_ZK_REGION_SPLIT:
        // If region split not enabled then skip only if event type is RS_ZK_REQUEST_REGION_SPLIT,
        // allow on-going split operations
        if ((!isRegionSplitOrMergeEnabled(rt, prettyPrintedRegionName, MasterSwitchType.SPLIT)
            && rt.getEventType() == EventType.RS_ZK_REQUEST_REGION_SPLIT)
            || !handleRegionSplitting(rt, encodedName, prettyPrintedRegionName, sn)) {
          deleteSplittingNode(encodedName, sn);
        }
        break;

      case RS_ZK_REQUEST_REGION_MERGE:
      case RS_ZK_REGION_MERGING:
      case RS_ZK_REGION_MERGED:
        // Merged region is a new region, we can't find it in the region states now.
        // However, the two merging regions are not new. They should be in state for merging.
        // If region merge not enabled then skip only if event type is RS_ZK_REQUEST_REGION_MERGE,
        // allow on-going merge operations
        if ((!isRegionSplitOrMergeEnabled(rt, prettyPrintedRegionName, MasterSwitchType.MERGE)
            && rt.getEventType() == EventType.RS_ZK_REQUEST_REGION_MERGE)
            || !handleRegionMerging(rt, encodedName, prettyPrintedRegionName, sn)) {
          deleteMergingNode(encodedName, sn);
        }
        break;

      case M_ZK_REGION_CLOSING:
        // Should see CLOSING after we have asked it to CLOSE or additional
        // times after already being in state of CLOSING
        if (regionState == null
            || !regionState.isPendingCloseOrClosingOnServer(sn)) {
          LOG.warn("Received CLOSING for " + prettyPrintedRegionName
            + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
            + regionStates.getRegionState(encodedName));
          return;
        }
        // Transition to CLOSING (or update stamp if already CLOSING)
        regionStates.updateRegionState(rt, State.CLOSING);
        break;

      case RS_ZK_REGION_CLOSED:
        // Should see CLOSED after CLOSING but possible after PENDING_CLOSE
        if (regionState == null
            || !regionState.isPendingCloseOrClosingOnServer(sn)) {
          LOG.warn("Received CLOSED for " + prettyPrintedRegionName
            + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
            + regionStates.getRegionState(encodedName));
          return;
        }
        // Handle CLOSED by assigning elsewhere or stopping if a disable
        // If we got here all is good.  Need to update RegionState -- else
        // what follows will fail because not in expected state.
        new ClosedRegionHandler(server, this, regionState.getRegion()).process();
        updateClosedRegionHandlerTracker(regionState.getRegion());
        break;

        case RS_ZK_REGION_FAILED_OPEN:
          if (regionState == null
              || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
            LOG.warn("Received FAILED_OPEN for " + prettyPrintedRegionName
              + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
              + regionStates.getRegionState(encodedName));
            return;
          }
          AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
          if (failedOpenCount == null) {
            failedOpenCount = new AtomicInteger();
            // No need to use putIfAbsent, or extra synchronization since
            // this whole handleRegion block is locked on the encoded region
            // name, and failedOpenTracker is updated only in this block
            // FindBugs: AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION
            failedOpenTracker.put(encodedName, failedOpenCount);
          }
          if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
            // Give up after maximumAttempts failed opens: park the region in
            // FAILED_OPEN rather than retrying forever.
            // FindBugs: AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION
            regionStates.updateRegionState(rt, State.FAILED_OPEN);
            // remove the tracking info to save memory, also reset
            // the count for next open initiative
            failedOpenTracker.remove(encodedName);
          } else {
            // Handle this the same as if it were opened and then closed.
            regionState = regionStates.setRegionStateTOCLOSED(rt.getRegionName(), sn);
            if (regionState != null) {
              // When there are more than one region server a new RS is selected as the
              // destination and the same is updated in the regionplan. (HBASE-5546)
              getRegionPlan(regionState.getRegion(), sn, true);
              new ClosedRegionHandler(server, this, regionState.getRegion()).process();
            }
          }
          break;

        case RS_ZK_REGION_OPENING:
          // Should see OPENING after we have asked it to OPEN or additional
          // times after already being in state of OPENING
          if (regionState == null
              || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
            LOG.warn("Received OPENING for " + prettyPrintedRegionName
              + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
              + regionStates.getRegionState(encodedName));
            return;
          }
          // Transition to OPENING (or update stamp if already OPENING)
          regionStates.updateRegionState(rt, State.OPENING);
          break;

        case RS_ZK_REGION_OPENED:
          // Should see OPENED after OPENING but possible after PENDING_OPEN.
          if (regionState == null
              || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
            LOG.warn("Received OPENED for " + prettyPrintedRegionName
              + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
              + regionStates.getRegionState(encodedName));

            if (regionState != null) {
              if(regionState.isOpened() && regionState.getServerName().equals(sn)) {
                //if this region was opened before on this rs, we don't have to unassign it. It won't cause
                //double assign. One possible scenario of what happened is HBASE-17275
                failedOpenTracker.remove(encodedName); // reset the count, if any
                new OpenedRegionHandler(
                    server, this, regionState.getRegion(), coordination, ord).process();
                updateOpenedRegionHandlerTracker(regionState.getRegion());
              } else {
                // Close it without updating the internal region states,
                // so as not to create double assignments in unlucky scenarios
                // mentioned in OpenRegionHandler#process
                unassign(regionState.getRegion(), null, -1, null, false, sn);
              }
            }
            return;
          }
          // Handle OPENED by removing from transition and deleted zk node
          regionState =
              regionStates.transitionOpenFromPendingOpenOrOpeningOnServer(rt,regionState, sn);
          if (regionState != null) {
            failedOpenTracker.remove(encodedName); // reset the count, if any
            new OpenedRegionHandler(
              server, this, regionState.getRegion(), coordination, ord).process();
            updateOpenedRegionHandlerTracker(regionState.getRegion());
          }
          break;

        default:
          throw new IllegalStateException("Received event is not valid.");
      }
    } finally {
      lock.unlock();
    }
  }
1263 
1264   /**
1265    * Check whether region split or merge enabled.
1266    * @param rt Region transition info
1267    * @param prettyPrintedRegionName Region name
1268    * @param switchType Region operation type
1269    * @param eventType Event type
1270    * @return true if region split/merge enabled
1271    */
1272   private boolean isRegionSplitOrMergeEnabled(RegionTransition rt, String prettyPrintedRegionName,
1273       MasterSwitchType switchType) {
1274     if (!((HMaster) server).getSplitOrMergeTracker().isSplitOrMergeEnabled(switchType)) {
1275       LOG.warn("Region " + switchType + " not enabled, skipping " + rt.getEventType()
1276           + " of reigon " + prettyPrintedRegionName);
1277       return false;
1278     }
1279     return true;
1280   }
1281 
1282   // For unit tests only
1283   boolean wasClosedHandlerCalled(HRegionInfo hri) {
1284     AtomicBoolean b = closedRegionHandlerCalled.get(hri);
1285     //compareAndSet to be sure that unit tests don't see stale values. Means,
1286     //we will return true exactly once unless the handler code resets to true
1287     //this value.
1288     return b == null ? false : b.compareAndSet(true, false);
1289   }
1290 
1291   //For unit tests only
1292   boolean wasOpenedHandlerCalled(HRegionInfo hri) {
1293     AtomicBoolean b = openedRegionHandlerCalled.get(hri);
1294     //compareAndSet to be sure that unit tests don't see stale values. Means,
1295     //we will return true exactly once unless the handler code resets to true
1296     //this value.
1297     return b == null ? false : b.compareAndSet(true, false);
1298   }
1299 
  //For unit tests only
  // Trackers stay null in production; tests enable tracking by calling this,
  // which in turn activates updateClosed/OpenedRegionHandlerTracker below.
  void initializeHandlerTrackers() {
    closedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
    openedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
  }
1305 
1306   void updateClosedRegionHandlerTracker(HRegionInfo hri) {
1307     if (closedRegionHandlerCalled != null) { //only for unit tests this is true
1308       closedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
1309     }
1310   }
1311 
1312   void updateOpenedRegionHandlerTracker(HRegionInfo hri) {
1313     if (openedRegionHandlerCalled != null) { //only for unit tests this is true
1314       openedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
1315     }
1316   }
1317 
1318   // TODO: processFavoredNodes might throw an exception, for e.g., if the
1319   // meta could not be contacted/updated. We need to see how seriously to treat
1320   // this problem as. Should we fail the current assignment. We should be able
1321   // to recover from this problem eventually (if the meta couldn't be updated
1322   // things should work normally and eventually get fixed up).
1323   void processFavoredNodes(List<HRegionInfo> regions) throws IOException {
1324     if (!shouldAssignRegionsWithFavoredNodes) return;
1325     // The AM gets the favored nodes info for each region and updates the meta
1326     // table with that info
1327     Map<HRegionInfo, List<ServerName>> regionToFavoredNodes =
1328         new HashMap<HRegionInfo, List<ServerName>>();
1329     for (HRegionInfo region : regions) {
1330       regionToFavoredNodes.put(region,
1331           ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region));
1332     }
1333     FavoredNodeAssignmentHelper.updateMetaWithFavoredNodesInfo(regionToFavoredNodes,
1334       this.server.getConnection());
1335   }
1336 
1337   /**
1338    * Handle a ZK unassigned node transition triggered by HBCK repair tool.
1339    * <p>
1340    * This is handled in a separate code path because it breaks the normal rules.
1341    * @param rt
1342    */
1343   @SuppressWarnings("deprecation")
1344   private void handleHBCK(RegionTransition rt) {
1345     String encodedName = HRegionInfo.encodeRegionName(rt.getRegionName());
1346     LOG.info("Handling HBCK triggered transition=" + rt.getEventType() +
1347       ", server=" + rt.getServerName() + ", region=" +
1348       HRegionInfo.prettyPrint(encodedName));
1349     RegionState regionState = regionStates.getRegionTransitionState(encodedName);
1350     switch (rt.getEventType()) {
1351       case M_ZK_REGION_OFFLINE:
1352         HRegionInfo regionInfo;
1353         if (regionState != null) {
1354           regionInfo = regionState.getRegion();
1355         } else {
1356           try {
1357             byte [] name = rt.getRegionName();
1358             Pair<HRegionInfo, ServerName> p = MetaTableAccessor.getRegion(
1359               this.server.getConnection(), name);
1360             regionInfo = p.getFirst();
1361           } catch (IOException e) {
1362             LOG.info("Exception reading hbase:meta doing HBCK repair operation", e);
1363             return;
1364           }
1365         }
1366         LOG.info("HBCK repair is triggering assignment of region=" +
1367             regionInfo.getRegionNameAsString());
1368         // trigger assign, node is already in OFFLINE so don't need to update ZK
1369         assign(regionInfo, false);
1370         break;
1371 
1372       default:
1373         LOG.warn("Received unexpected region state from HBCK: " + rt.toString());
1374         break;
1375     }
1376 
1377   }
1378 
1379   // ZooKeeper events
1380 
1381   /**
1382    * New unassigned node has been created.
1383    *
1384    * <p>This happens when an RS begins the OPENING or CLOSING of a region by
1385    * creating an unassigned node.
1386    *
1387    * <p>When this happens we must:
1388    * <ol>
1389    *   <li>Watch the node for further events</li>
1390    *   <li>Read and handle the state in the node</li>
1391    * </ol>
1392    */
1393   @Override
1394   public void nodeCreated(String path) {
1395     handleAssignmentEvent(path);
1396   }
1397 
1398   /**
1399    * Existing unassigned node has had data changed.
1400    *
1401    * <p>This happens when an RS transitions from OFFLINE to OPENING, or between
1402    * OPENING/OPENED and CLOSING/CLOSED.
1403    *
1404    * <p>When this happens we must:
1405    * <ol>
1406    *   <li>Watch the node for further events</li>
1407    *   <li>Read and handle the state in the node</li>
1408    * </ol>
1409    */
1410   @Override
1411   public void nodeDataChanged(String path) {
1412     handleAssignmentEvent(path);
1413   }
1414 
1415 
  // We don't want two events on the same region to be managed simultaneously.
  // If an event for a region is already in progress, later events for that
  // region must wait. We therefore track the region names currently being
  // processed, plus a waiting list of deferred events.
  private final Set<String> regionsInProgress = new HashSet<String>();
  // LinkedHashMultimap preserves put order on retrieval, so waiting events for
  // a region are replayed in the same order they were received.
  // Guarded by synchronized(zkEventWorkerWaitingList); see zkEventWorkersSubmit.
  private final LinkedHashMultimap <String, RegionRunnable>
      zkEventWorkerWaitingList = LinkedHashMultimap.create();
1424 
1425   /**
1426    * A specific runnable that works only on a region.
1427    */
1428   private interface RegionRunnable extends Runnable{
1429     /**
1430      * @return - the name of the region it works on.
1431      */
1432     String getRegionName();
1433   }
1434 
1435   /**
1436    * Submit a task, ensuring that there is only one task at a time that working on a given region.
1437    * Order is respected.
1438    */
1439   protected void zkEventWorkersSubmit(final RegionRunnable regRunnable) {
1440 
1441     synchronized (regionsInProgress) {
1442       // If we're there is already a task with this region, we add it to the
1443       //  waiting list and return.
1444       if (regionsInProgress.contains(regRunnable.getRegionName())) {
1445         synchronized (zkEventWorkerWaitingList){
1446           zkEventWorkerWaitingList.put(regRunnable.getRegionName(), regRunnable);
1447         }
1448         return;
1449       }
1450 
1451       // No event in progress on this region => we can submit a new task immediately.
1452       regionsInProgress.add(regRunnable.getRegionName());
1453       zkEventWorkers.submit(new Runnable() {
1454         @Override
1455         public void run() {
1456           try {
1457             regRunnable.run();
1458           } finally {
1459             // now that we have finished, let's see if there is an event for the same region in the
1460             //  waiting list. If it's the case, we can now submit it to the pool.
1461             synchronized (regionsInProgress) {
1462               regionsInProgress.remove(regRunnable.getRegionName());
1463               synchronized (zkEventWorkerWaitingList) {
1464                 java.util.Set<RegionRunnable> waiting = zkEventWorkerWaitingList.get(
1465                     regRunnable.getRegionName());
1466                 if (!waiting.isEmpty()) {
1467                   // We want the first object only. The only way to get it is through an iterator.
1468                   RegionRunnable toSubmit = waiting.iterator().next();
1469                   zkEventWorkerWaitingList.remove(toSubmit.getRegionName(), toSubmit);
1470                   zkEventWorkersSubmit(toSubmit);
1471                 }
1472               }
1473             }
1474           }
1475         }
1476       });
1477     }
1478   }
1479 
  /**
   * Callback for deletion of an unassigned znode. Depending on the in-memory
   * region state, this either marks the region online (OPENED/SPLITTING on an
   * online server), rolls back a split by offlining the daughters, or brings
   * merging parents back online. Work is serialized per region via
   * zkEventWorkersSubmit.
   */
  @Override
  public void nodeDeleted(final String path) {
    if (path.startsWith(watcher.assignmentZNode)) {
      final String regionName = ZKAssign.getRegionName(watcher, path);
      zkEventWorkersSubmit(new RegionRunnable() {
        @Override
        public String getRegionName() {
          return regionName;
        }

        @Override
        public void run() {
          // Per-region lock: serializes with assign/unassign on this region.
          Lock lock = locker.acquireLock(regionName);
          try {
            RegionState rs = regionStates.getRegionTransitionState(regionName);
            if (rs == null) {
              rs = regionStates.getRegionState(regionName);
              if (rs == null || !rs.isMergingNew()) {
                // MergingNew is an offline state
                return;
              }
            }

            HRegionInfo regionInfo = rs.getRegion();
            String regionNameStr = regionInfo.getRegionNameAsString();
            LOG.debug("Znode " + regionNameStr + " deleted, state: " + rs);

            // Whether the table is being (or has been) disabled; if so, a
            // region we just onlined must be closed again right away.
            boolean disabled = getTableStateManager().isTableState(regionInfo.getTable(),
                ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING);

            ServerName serverName = rs.getServerName();
            if (serverManager.isServerOnline(serverName)) {
              if (rs.isOnServer(serverName) && (rs.isOpened() || rs.isSplitting())) {
                synchronized (regionStates) {
                  regionOnline(regionInfo, serverName);
                  if (rs.isSplitting() && splitRegions.containsKey(regionInfo)) {
                    // Check if the daugter regions are still there, if they are present, offline
                    // as its the case of a rollback.
                    HRegionInfo hri_a = splitRegions.get(regionInfo).getFirst();
                    HRegionInfo hri_b = splitRegions.get(regionInfo).getSecond();
                    if (!regionStates.isRegionInTransition(hri_a.getEncodedName())) {
                      LOG.warn("Split daughter region not in transition " + hri_a);
                    }
                    // NOTE(review): log message below is missing a space before hri_b.
                    if (!regionStates.isRegionInTransition(hri_b.getEncodedName())) {
                      LOG.warn("Split daughter region not in transition" + hri_b);
                    }
                    regionOffline(hri_a);
                    regionOffline(hri_b);
                    splitRegions.remove(regionInfo);
                  }
                  if (disabled) {
                    // if server is offline, no hurt to unassign again
                    // NOTE(review): log message is missing a space before "but".
                    LOG.info("Opened " + regionNameStr
                        + "but this table is disabled, triggering close of region");
                    unassign(regionInfo);
                  }
                }
              } else if (rs.isMergingNew()) {
                // Merge rolled back / znode gone: bring the merging parents online.
                synchronized (regionStates) {
                  String p = regionInfo.getEncodedName();
                  PairOfSameType<HRegionInfo> regions = mergingRegions.get(p);
                  if (regions != null) {
                    onlineMergingRegion(disabled, regions.getFirst(), serverName);
                    onlineMergingRegion(disabled, regions.getSecond(), serverName);
                  }
                }
              }
            }
          } finally {
            lock.unlock();
          }
        }

        // Bring a merging parent back online on serverName; if the table is
        // disabled, immediately trigger its close again.
        private void onlineMergingRegion(boolean disabled,
            final HRegionInfo hri, final ServerName serverName) {
          RegionState regionState = regionStates.getRegionState(hri);
          if (regionState != null && regionState.isMerging()
              && regionState.isOnServer(serverName)) {
            regionOnline(regionState.getRegion(), serverName);
            if (disabled) {
              unassign(hri);
            }
          }
        }
      });
    }
  }
1567 
1568   /**
1569    * New unassigned node has been created.
1570    *
1571    * <p>This happens when an RS begins the OPENING, SPLITTING or CLOSING of a
1572    * region by creating a znode.
1573    *
1574    * <p>When this happens we must:
1575    * <ol>
1576    *   <li>Watch the node for further children changed events</li>
1577    *   <li>Watch all new children for changed events</li>
1578    * </ol>
1579    */
1580   @Override
1581   public void nodeChildrenChanged(String path) {
1582     if (path.equals(watcher.assignmentZNode)) {
1583       zkEventWorkers.submit(new Runnable() {
1584         @Override
1585         public void run() {
1586           try {
1587             // Just make sure we see the changes for the new znodes
1588             List<String> children =
1589               ZKUtil.listChildrenAndWatchForNewChildren(
1590                 watcher, watcher.assignmentZNode);
1591             if (children != null) {
1592               Stat stat = new Stat();
1593               for (String child : children) {
1594                 // if region is in transition, we already have a watch
1595                 // on it, so no need to watch it again. So, as I know for now,
1596                 // this is needed to watch splitting nodes only.
1597                 if (!regionStates.isRegionInTransition(child)) {
1598                   ZKAssign.getDataAndWatch(watcher, child, stat);
1599                 }
1600               }
1601             }
1602           } catch (KeeperException e) {
1603             server.abort("Unexpected ZK exception reading unassigned children", e);
1604           }
1605         }
1606       });
1607     }
1608   }
1609 
1610 
1611   /**
1612    * Marks the region as online.  Removes it from regions in transition and
1613    * updates the in-memory assignment information.
1614    * <p>
1615    * Used when a region has been successfully opened on a region server.
1616    * @param regionInfo
1617    * @param sn
1618    */
1619   void regionOnline(HRegionInfo regionInfo, ServerName sn) {
1620     regionOnline(regionInfo, sn, HConstants.NO_SEQNUM);
1621   }
1622 
1623   void regionOnline(HRegionInfo regionInfo, ServerName sn, long openSeqNum) {
1624     numRegionsOpened.incrementAndGet();
1625     regionStates.regionOnline(regionInfo, sn, openSeqNum);
1626 
1627     // Remove plan if one.
1628     clearRegionPlan(regionInfo);
1629     balancer.regionOnline(regionInfo, sn);
1630 
1631     // Tell our listeners that a region was opened
1632     sendRegionOpenedNotification(regionInfo, sn);
1633   }
1634 
1635   /**
1636    * Pass the assignment event to a worker for processing.
1637    * Each worker is a single thread executor service.  The reason
1638    * for just one thread is to make sure all events for a given
1639    * region are processed in order.
1640    *
1641    * @param path
1642    */
1643   private void handleAssignmentEvent(final String path) {
1644     if (path.startsWith(watcher.assignmentZNode)) {
1645       final String regionName = ZKAssign.getRegionName(watcher, path);
1646 
1647       zkEventWorkersSubmit(new RegionRunnable() {
1648         @Override
1649         public String getRegionName() {
1650           return regionName;
1651         }
1652 
1653         @Override
1654         public void run() {
1655           try {
1656             Stat stat = new Stat();
1657             byte [] data = ZKAssign.getDataAndWatch(watcher, path, stat);
1658             if (data == null) return;
1659 
1660             RegionTransition rt = RegionTransition.parseFrom(data);
1661 
1662             // TODO: This code is tied to ZK anyway, so for now leaving it as is,
1663             // will refactor when whole region assignment will be abstracted from ZK
1664             BaseCoordinatedStateManager csm =
1665               (BaseCoordinatedStateManager) server.getCoordinatedStateManager();
1666             OpenRegionCoordination openRegionCoordination = csm.getOpenRegionCoordination();
1667 
1668             ZkOpenRegionCoordination.ZkOpenRegionDetails zkOrd =
1669               new ZkOpenRegionCoordination.ZkOpenRegionDetails();
1670             zkOrd.setVersion(stat.getVersion());
1671             zkOrd.setServerName(csm.getServer().getServerName());
1672 
1673             handleRegion(rt, openRegionCoordination, zkOrd);
1674           } catch (KeeperException e) {
1675             server.abort("Unexpected ZK exception reading unassigned node data", e);
1676           } catch (DeserializationException e) {
1677             server.abort("Unexpected exception deserializing node data", e);
1678           }
1679         }
1680       });
1681     }
1682   }
1683 
1684   /**
1685    * Marks the region as offline.  Removes it from regions in transition and
1686    * removes in-memory assignment information.
1687    * <p>
1688    * Used when a region has been closed and should remain closed.
1689    * @param regionInfo
1690    */
1691   public void regionOffline(final HRegionInfo regionInfo) {
1692     if (regionStates.isRegionInState(regionInfo, State.MERGED, State.SPLIT)) {
1693       LOG.info("Try to offline region " + regionInfo.getEncodedName() +
1694           ", which is at state " + regionStates.getRegionState(regionInfo).getState() + ", skip");
1695       return;
1696     }
1697     regionOffline(regionInfo, null);
1698   }
1699 
1700   /**
1701    * Marks the region as offline. In addition whether removing it from
1702    * replicas and master in-memory server holding map.
1703    * <p>
1704    * @param regionInfo - region info.
1705    * @param force - setting to true to force this region to be removed from replicas and master
1706    *   in-memory server holding map, to make this region not be re-opened on any other region
1707    *   servers. The only use case is hbck for now.
1708    */
1709   public void regionOffline(final HRegionInfo regionInfo, boolean force) {
1710     regionOffline(regionInfo, null, force);
1711   }
1712 
1713   public void offlineDisabledRegion(HRegionInfo regionInfo) {
1714     if (useZKForAssignment) {
1715       // Disabling so should not be reassigned, just delete the CLOSED node
1716       LOG.debug("Table being disabled so deleting ZK node and removing from " +
1717         "regions in transition, skipping assignment of region " +
1718           regionInfo.getRegionNameAsString());
1719       String encodedName = regionInfo.getEncodedName();
1720       deleteNodeInStates(encodedName, "closed", null,
1721         EventType.RS_ZK_REGION_CLOSED, EventType.M_ZK_REGION_OFFLINE);
1722     }
1723     replicasToClose.remove(regionInfo);
1724     //Set servername in regionstate to null, see HBASE-18014
1725     getRegionStates().updateRegionState(regionInfo, State.OFFLINE, null);
1726     regionOffline(regionInfo);
1727   }
1728 
1729   // Assignment methods
1730 
1731   /**
1732    * Assigns the specified region.
1733    * <p>
1734    * If a RegionPlan is available with a valid destination then it will be used
1735    * to determine what server region is assigned to.  If no RegionPlan is
1736    * available, region will be assigned to a random available server.
1737    * <p>
1738    * Updates the RegionState and sends the OPEN RPC.
1739    * <p>
1740    * This will only succeed if the region is in transition and in a CLOSED or
1741    * OFFLINE state or not in transition (in-memory not zk), and of course, the
1742    * chosen server is up and running (It may have just crashed!).  If the
1743    * in-memory checks pass, the zk node is forced to OFFLINE before assigning.
1744    *
1745    * @param region server to be assigned
1746    * @param setOfflineInZK whether ZK node should be created/transitioned to an
1747    *                       OFFLINE state before assigning the region
1748    */
1749   public void assign(HRegionInfo region, boolean setOfflineInZK) {
1750     assign(region, setOfflineInZK, false);
1751   }
1752 
1753   /**
1754    * Use care with forceNewPlan. It could cause double assignment.
1755    */
1756   public void assign(HRegionInfo region,
1757       boolean setOfflineInZK, boolean forceNewPlan) {
1758     if (isDisabledorDisablingRegionInRIT(region)) {
1759       return;
1760     }
1761     String encodedName = region.getEncodedName();
1762     Lock lock = locker.acquireLock(encodedName);
1763     try {
1764       RegionState state = forceRegionStateToOffline(region, forceNewPlan);
1765       if (state != null) {
1766         if (regionStates.wasRegionOnDeadServer(encodedName)) {
1767           LOG.info("Skip assigning " + region.getRegionNameAsString()
1768             + ", it's host " + regionStates.getLastRegionServerOfRegion(encodedName)
1769             + " is dead but not processed yet");
1770           return;
1771         }
1772         assign(state, setOfflineInZK && useZKForAssignment, forceNewPlan);
1773       }
1774     } finally {
1775       lock.unlock();
1776     }
1777   }
1778 
1779   /**
1780    * Bulk assign regions to <code>destination</code>.
1781    * @param destination
1782    * @param regions Regions to assign.
1783    * @return true if successful
1784    */
1785   boolean assign(final ServerName destination, final List<HRegionInfo> regions)
1786     throws InterruptedException {
1787     long startTime = EnvironmentEdgeManager.currentTime();
1788     try {
1789       int regionCount = regions.size();
1790       if (regionCount == 0) {
1791         return true;
1792       }
1793       LOG.info("Assigning " + regionCount + " region(s) to " + destination.toString());
1794       Set<String> encodedNames = new HashSet<String>(regionCount);
1795       for (HRegionInfo region : regions) {
1796         encodedNames.add(region.getEncodedName());
1797       }
1798 
1799       List<HRegionInfo> failedToOpenRegions = new ArrayList<HRegionInfo>();
1800       Map<String, Lock> locks = locker.acquireLocks(encodedNames);
1801       try {
1802         AtomicInteger counter = new AtomicInteger(0);
1803         Map<String, Integer> offlineNodesVersions = new ConcurrentHashMap<String, Integer>();
1804         OfflineCallback cb = new OfflineCallback(
1805           watcher, destination, counter, offlineNodesVersions);
1806         Map<String, RegionPlan> plans = new HashMap<String, RegionPlan>(regions.size());
1807         List<RegionState> states = new ArrayList<RegionState>(regions.size());
1808         for (HRegionInfo region : regions) {
1809           String encodedName = region.getEncodedName();
1810           if (!isDisabledorDisablingRegionInRIT(region)) {
1811             RegionState state = forceRegionStateToOffline(region, false);
1812             boolean onDeadServer = false;
1813             if (state != null) {
1814               if (regionStates.wasRegionOnDeadServer(encodedName)) {
1815                 LOG.info("Skip assigning " + region.getRegionNameAsString()
1816                   + ", it's host " + regionStates.getLastRegionServerOfRegion(encodedName)
1817                   + " is dead but not processed yet");
1818                 onDeadServer = true;
1819               } else if (!useZKForAssignment
1820                   || asyncSetOfflineInZooKeeper(state, cb, destination)) {
1821                 RegionPlan plan = new RegionPlan(region, state.getServerName(), destination);
1822                 plans.put(encodedName, plan);
1823                 states.add(state);
1824                 continue;
1825               }
1826             }
1827             // Reassign if the region wasn't on a dead server
1828             if (!onDeadServer) {
1829               LOG.info("failed to force region state to offline or "
1830                 + "failed to set it offline in ZK, will reassign later: " + region);
1831               failedToOpenRegions.add(region); // assign individually later
1832             }
1833           }
1834           // Release the lock, this region is excluded from bulk assign because
1835           // we can't update its state, or set its znode to offline.
1836           Lock lock = locks.remove(encodedName);
1837           lock.unlock();
1838         }
1839 
1840         if (useZKForAssignment) {
1841           // Wait until all unassigned nodes have been put up and watchers set.
1842           int total = states.size();
1843           for (int oldCounter = 0; !server.isStopped();) {
1844             int count = counter.get();
1845             if (oldCounter != count) {
1846               LOG.debug(destination.toString() + " unassigned znodes=" + count +
1847                 " of total=" + total + "; oldCounter=" + oldCounter);
1848               oldCounter = count;
1849             }
1850             if (count >= total) break;
1851             Thread.sleep(5);
1852           }
1853         }
1854 
1855         if (server.isStopped()) {
1856           return false;
1857         }
1858 
1859         // Add region plans, so we can updateTimers when one region is opened so
1860         // that unnecessary timeout on RIT is reduced.
1861         this.addPlans(plans);
1862 
1863         List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos =
1864           new ArrayList<Triple<HRegionInfo, Integer, List<ServerName>>>(states.size());
1865         for (RegionState state: states) {
1866           HRegionInfo region = state.getRegion();
1867           String encodedRegionName = region.getEncodedName();
1868           Integer nodeVersion = offlineNodesVersions.get(encodedRegionName);
1869           if (useZKForAssignment && (nodeVersion == null || nodeVersion == -1)) {
1870             LOG.warn("failed to offline in zookeeper: " + region);
1871             failedToOpenRegions.add(region); // assign individually later
1872             Lock lock = locks.remove(encodedRegionName);
1873             lock.unlock();
1874           } else {
1875             regionStates.updateRegionState(
1876               region, State.PENDING_OPEN, destination);
1877             List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1878             if (this.shouldAssignRegionsWithFavoredNodes) {
1879               favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1880             }
1881             regionOpenInfos.add(new Triple<HRegionInfo, Integer,  List<ServerName>>(
1882               region, nodeVersion, favoredNodes));
1883           }
1884         }
1885 
1886         // Move on to open regions.
1887         try {
1888           // Send OPEN RPC. If it fails on a IOE or RemoteException,
1889           // regions will be assigned individually.
1890           long maxWaitTime = System.currentTimeMillis() +
1891             this.server.getConfiguration().
1892               getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1893           for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
1894             try {
1895               // regionOpenInfos is empty if all regions are in failedToOpenRegions list
1896               if (regionOpenInfos.isEmpty()) {
1897                 break;
1898               }
1899               List<RegionOpeningState> regionOpeningStateList = serverManager
1900                 .sendRegionOpen(destination, regionOpenInfos);
1901               if (regionOpeningStateList == null) {
1902                 // Failed getting RPC connection to this server
1903                 return false;
1904               }
1905               for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) {
1906                 RegionOpeningState openingState = regionOpeningStateList.get(k);
1907                 if (openingState != RegionOpeningState.OPENED) {
1908                   HRegionInfo region = regionOpenInfos.get(k).getFirst();
1909                   if (openingState == RegionOpeningState.ALREADY_OPENED) {
1910                     processAlreadyOpenedRegion(region, destination);
1911                   } else if (openingState == RegionOpeningState.FAILED_OPENING) {
1912                     // Failed opening this region, reassign it later
1913                     failedToOpenRegions.add(region);
1914                   } else {
1915                     LOG.warn("THIS SHOULD NOT HAPPEN: unknown opening state "
1916                       + openingState + " in assigning region " + region);
1917                   }
1918                 }
1919               }
1920               break;
1921             } catch (IOException e) {
1922               if (e instanceof RemoteException) {
1923                 e = ((RemoteException)e).unwrapRemoteException();
1924               }
1925               if (e instanceof RegionServerStoppedException) {
1926                 LOG.warn("The region server was shut down, ", e);
1927                 // No need to retry, the region server is a goner.
1928                 return false;
1929               } else if (e instanceof ServerNotRunningYetException) {
1930                 long now = System.currentTimeMillis();
1931                 if (now < maxWaitTime) {
1932                   LOG.debug("Server is not yet up; waiting up to " +
1933                     (maxWaitTime - now) + "ms", e);
1934                   Thread.sleep(100);
1935                   i--; // reset the try count
1936                   continue;
1937                 }
1938               } else if (e instanceof java.net.SocketTimeoutException
1939                   && this.serverManager.isServerOnline(destination)) {
1940                 // In case socket is timed out and the region server is still online,
1941                 // the openRegion RPC could have been accepted by the server and
1942                 // just the response didn't go through.  So we will retry to
1943                 // open the region on the same server.
1944                 if (LOG.isDebugEnabled()) {
1945                   LOG.debug("Bulk assigner openRegion() to " + destination
1946                     + " has timed out, but the regions might"
1947                     + " already be opened on it.", e);
1948                 }
1949                 // wait and reset the re-try count, server might be just busy.
1950                 Thread.sleep(100);
1951                 i--;
1952                 continue;
1953               }
1954               throw e;
1955             }
1956           }
1957         } catch (IOException e) {
1958           // Can be a socket timeout, EOF, NoRouteToHost, etc
1959           LOG.info("Unable to communicate with " + destination
1960             + " in order to assign regions, ", e);
1961           return false;
1962         }
1963       } finally {
1964         for (Lock lock : locks.values()) {
1965           lock.unlock();
1966         }
1967       }
1968 
1969       if (!failedToOpenRegions.isEmpty()) {
1970         for (HRegionInfo region : failedToOpenRegions) {
1971           if (!regionStates.isRegionOnline(region)) {
1972             invokeAssign(region);
1973           }
1974         }
1975       }
1976 
1977       // wait for assignment completion
1978       ArrayList<HRegionInfo> userRegionSet = new ArrayList<HRegionInfo>(regions.size());
1979       for (HRegionInfo region: regions) {
1980         if (!region.getTable().isSystemTable()) {
1981           userRegionSet.add(region);
1982         }
1983       }
1984       if (!waitForAssignment(userRegionSet, true, userRegionSet.size(),
1985             System.currentTimeMillis())) {
1986         LOG.debug("some user regions are still in transition: " + userRegionSet);
1987       }
1988       LOG.debug("Bulk assigning done for " + destination);
1989       return true;
1990     } finally {
1991       metricsAssignmentManager.updateBulkAssignTime(EnvironmentEdgeManager.currentTime() - startTime);
1992     }
1993   }
1994 
1995   /**
1996    * Send CLOSE RPC if the server is online, otherwise, offline the region.
1997    *
1998    * The RPC will be sent only to the region sever found in the region state
1999    * if it is passed in, otherwise, to the src server specified. If region
2000    * state is not specified, we don't update region state at all, instead
2001    * we just send the RPC call. This is useful for some cleanup without
2002    * messing around the region states (see handleRegion, on region opened
2003    * on an unexpected server scenario, for an example)
2004    */
2005   private void unassign(final HRegionInfo region,
2006       final RegionState state, final int versionOfClosingNode,
2007       final ServerName dest, final boolean transitionInZK,
2008       final ServerName src) {
2009     ServerName server = src;
2010     if (state != null) {
2011       server = state.getServerName();
2012     }
2013     long maxWaitTime = -1;
2014     for (int i = 1; i <= this.maximumAttempts; i++) {
2015       if (this.server.isStopped() || this.server.isAborted()) {
2016         LOG.debug("Server stopped/aborted; skipping unassign of " + region);
2017         return;
2018       }
2019       // ClosedRegionhandler can remove the server from this.regions
2020       if (!serverManager.isServerOnline(server)) {
2021         LOG.debug("Offline " + region.getRegionNameAsString()
2022             + ", no need to unassign since it's on a dead server: " + server);
2023         if (transitionInZK) {
2024           // delete the node. if no node exists need not bother.
2025           deleteClosingOrClosedNode(region, server);
2026         }
2027         if (state != null) {
2028           regionOffline(region);
2029         }
2030         return;
2031       }
2032       long sleepTime = 0;
2033       try {
2034         // Send CLOSE RPC
2035         if (serverManager.sendRegionClose(server, region, versionOfClosingNode, dest,
2036           transitionInZK)) {
2037           LOG.debug("Sent CLOSE to " + server + " for region " + region.getRegionNameAsString());
2038           if (useZKForAssignment && !transitionInZK && state != null) {
2039             // Retry to make sure the region is
2040             // closed so as to avoid double assignment.
2041             unassign(region, state, versionOfClosingNode, dest, transitionInZK, src);
2042           }
2043           return;
2044         }
2045         // This never happens. Currently regionserver close always return true.
2046         // Todo; this can now happen (0.96) if there is an exception in a coprocessor
2047         LOG.warn("Server " + server + " region CLOSE RPC returned false for "
2048             + region.getRegionNameAsString());
2049       } catch (Throwable t) {
2050         Configuration conf = this.server.getConfiguration();
2051         if (t instanceof RemoteException) {
2052           t = ((RemoteException) t).unwrapRemoteException();
2053         }
2054         boolean logRetries = true;
2055         if (t instanceof RegionServerStoppedException
2056             || t instanceof ServerNotRunningYetException) {
2057           // RS is aborting or stopping, we cannot offline the region since the region may need
2058           // to do WAL recovery. Until we see the RS expiration, we should retry.
2059           sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
2060             RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
2061 
2062         } else if (t instanceof NotServingRegionException) {
2063           LOG.debug(
2064             "Offline " + region.getRegionNameAsString() + ", it's not any more on " + server, t);
2065           if (transitionInZK) {
2066             deleteClosingOrClosedNode(region, server);
2067           }
2068           if (state != null) {
2069             regionOffline(region);
2070           }
2071           return;
2072         } else if ((t instanceof FailedServerException)
2073             || (state != null && t instanceof RegionAlreadyInTransitionException)) {
2074           if (t instanceof FailedServerException) {
2075             sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
2076               RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
2077           } else {
2078             // RS is already processing this region, only need to update the timestamp
2079             LOG.debug("update " + state + " the timestamp.");
2080             state.updateTimestampToNow();
2081             if (maxWaitTime < 0) {
2082               maxWaitTime = EnvironmentEdgeManager.currentTime() + conf.getLong(
2083                 ALREADY_IN_TRANSITION_WAITTIME, DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
2084             }
2085             long now = EnvironmentEdgeManager.currentTime();
2086             if (now < maxWaitTime) {
2087               LOG.debug("Region is already in transition; " + "waiting up to "
2088                   + (maxWaitTime - now) + "ms",
2089                 t);
2090               sleepTime = 100;
2091               i--; // reset the try count
2092               logRetries = false;
2093             }
2094           }
2095         }
2096 
2097         if (logRetries) {
2098           LOG.info("Server " + server + " returned " + t + " for " + region.getRegionNameAsString()
2099               + ", try=" + i + " of " + this.maximumAttempts,
2100             t);
2101           // Presume retry or server will expire.
2102         }
2103       }
2104       // sleepTime is set in one of the following cases (reasons commented above):
2105       // 1. Region server stopping or aborting
2106       // 2. Region already in transition
2107       // 3. Connecting to server that is already dead
2108       //
2109       // If sleepTime is not set by any of the cases, set it to sleep for
2110       // configured exponential backoff time
2111       if (sleepTime == 0 && i != maximumAttempts) {
2112         sleepTime = backoffPolicy.getBackoffTime(retryConfig, i);
2113         LOG.info("Waiting for " + sleepTime + "milliseconds exponential backoff time for "
2114             + region.getRegionNameAsString() + " before next retry " + (i + 1) + " of "
2115             + this.maximumAttempts);
2116       }
2117       try {
2118         if (sleepTime > 0 && i != maximumAttempts) {
2119           Thread.sleep(sleepTime);
2120         }
2121       } catch (InterruptedException ie) {
2122         LOG.warn("Failed to unassign " + region.getRegionNameAsString() + " since interrupted", ie);
2123         if (state != null) {
2124           regionStates.updateRegionState(region, State.FAILED_CLOSE);
2125         }
2126         Thread.currentThread().interrupt();
2127         return;
2128       }
2129     }
2130 
2131     // Run out of attempts
2132     if (state != null) {
2133       regionStates.updateRegionState(region, State.FAILED_CLOSE);
2134     }
2135   }
2136 
2137   /**
2138    * Set region to OFFLINE unless it is opening and forceNewPlan is false.
2139    */
2140   private RegionState forceRegionStateToOffline(
2141       final HRegionInfo region, final boolean forceNewPlan) {
2142     RegionState state = regionStates.getRegionState(region);
2143     if (state == null) {
2144       LOG.warn("Assigning but not in region states: " + region);
2145       state = regionStates.createRegionState(region);
2146     }
2147 
2148     ServerName sn = state.getServerName();
2149     if (forceNewPlan && LOG.isDebugEnabled()) {
2150       LOG.debug("Force region state offline " + state);
2151     }
2152 
2153     // We need a lock on the region as we could update it
2154     Lock lock = locker.acquireLock(region.getEncodedName());
2155     try {
2156       switch (state.getState()) {
2157         case OPEN:
2158         case OPENING:
2159         case PENDING_OPEN:
2160         case CLOSING:
2161         case PENDING_CLOSE:
2162           if (!forceNewPlan) {
2163             LOG.debug("Skip assigning " + region + ", it is already " + state);
2164             return null;
2165           }
2166         case FAILED_CLOSE:
2167         case FAILED_OPEN:
2168           unassign(region, state, -1, null, false, null);
2169           state = regionStates.getRegionState(region);
2170           if (state.isFailedClose()) {
2171             // If we can't close the region, we can't re-assign
2172             // it so as to avoid possible double assignment/data loss.
2173             LOG.info("Skip assigning " + region + ", we couldn't close it: " + state);
2174             return null;
2175           }
2176         case OFFLINE:
2177           // This region could have been open on this server
2178           // for a while. If the server is dead and not processed
2179           // yet, we can move on only if the meta shows the
2180           // region is not on this server actually, or on a server
2181           // not dead, or dead and processed already.
2182           // In case not using ZK, we don't need this check because
2183           // we have the latest info in memory, and the caller
2184           // will do another round checking any way.
2185           if (useZKForAssignment && regionStates.isServerDeadAndNotProcessed(sn)
2186               && wasRegionOnDeadServerByMeta(region, sn)) {
2187             if (!regionStates.isRegionInTransition(region)) {
2188               LOG.info(
2189                 "Updating the state to " + State.OFFLINE + " to allow to be reassigned by SSH");
2190               regionStates.updateRegionState(region, State.OFFLINE);
2191             }
2192             LOG.info("Skip assigning " + region.getRegionNameAsString()
2193                 + ", it is on a dead but not processed yet server: " + sn);
2194             return null;
2195           }
2196         case CLOSED:
2197           break;
2198         default:
2199           LOG.error("Trying to assign region " + region + ", which is " + state);
2200           return null;
2201       }
2202     } finally {
2203       lock.unlock();
2204     }
2205     return state;
2206   }
2207 
2208   @SuppressWarnings("deprecation")
2209   protected boolean wasRegionOnDeadServerByMeta(
2210       final HRegionInfo region, final ServerName sn) {
2211     try {
2212       if (region.isMetaRegion()) {
2213         ServerName server = this.server.getMetaTableLocator().
2214           getMetaRegionLocation(this.server.getZooKeeper());
2215         return regionStates.isServerDeadAndNotProcessed(server);
2216       }
2217       while (!server.isStopped()) {
2218         try {
2219           this.server.getMetaTableLocator().waitMetaRegionLocation(server.getZooKeeper());
2220           Result r = MetaTableAccessor.getRegionResult(server.getConnection(),
2221             region.getRegionName());
2222           if (r == null || r.isEmpty()) return false;
2223           ServerName server = HRegionInfo.getServerName(r);
2224           return regionStates.isServerDeadAndNotProcessed(server);
2225         } catch (IOException ioe) {
2226           LOG.info("Received exception accessing hbase:meta during force assign "
2227             + region.getRegionNameAsString() + ", retrying", ioe);
2228         }
2229       }
2230     } catch (InterruptedException e) {
2231       Thread.currentThread().interrupt();
2232       LOG.info("Interrupted accessing hbase:meta", e);
2233     }
2234     // Call is interrupted or server is stopped.
2235     return regionStates.isServerDeadAndNotProcessed(sn);
2236   }
2237 
2238   /**
2239    * Caller must hold lock on the passed <code>state</code> object.
2240    * @param state
2241    * @param setOfflineInZK
2242    * @param forceNewPlan
2243    */
2244   public void assign(RegionState state,
2245       boolean setOfflineInZK, final boolean forceNewPlan) {
2246     long startTime = EnvironmentEdgeManager.currentTime();
2247     try {
2248       Configuration conf = server.getConfiguration();
2249       RegionState currentState = state;
2250       int versionOfOfflineNode = -1;
2251       RegionPlan plan = null;
2252       long maxWaitTime = -1;
2253       HRegionInfo region = state.getRegion();
2254       RegionOpeningState regionOpenState;
2255       Throwable previousException = null;
2256       for (int i = 1; i <= maximumAttempts; i++) {
2257         if (server.isStopped() || server.isAborted()) {
2258           LOG.info("Skip assigning " + region.getRegionNameAsString()
2259             + ", the server is stopped/aborted");
2260           return;
2261         }
2262 
2263         if (plan == null) { // Get a server for the region at first
2264           try {
2265             plan = getRegionPlan(region, forceNewPlan);
2266           } catch (HBaseIOException e) {
2267             LOG.warn("Failed to get region plan", e);
2268           }
2269         }
2270 
2271         if (plan == null) {
2272           LOG.warn("Unable to determine a plan to assign " + region);
2273 
2274           // For meta region, we have to keep retrying until succeeding
2275           if (region.isMetaRegion()) {
2276             if (i == maximumAttempts) {
2277               i = 0; // re-set attempt count to 0 for at least 1 retry
2278 
2279               LOG.warn("Unable to determine a plan to assign a hbase:meta region " + region +
2280                 " after maximumAttempts (" + this.maximumAttempts +
2281                 "). Reset attempts count and continue retrying.");
2282             }
2283             waitForRetryingMetaAssignment();
2284             continue;
2285           }
2286 
2287           regionStates.updateRegionState(region, State.FAILED_OPEN);
2288           return;
2289         }
2290         if (setOfflineInZK && versionOfOfflineNode == -1) {
2291           LOG.info("Setting node as OFFLINED in ZooKeeper for region " + region);
2292           // get the version of the znode after setting it to OFFLINE.
2293           // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE
2294           versionOfOfflineNode = setOfflineInZooKeeper(currentState, plan.getDestination());
2295           if (versionOfOfflineNode != -1) {
2296             if (isDisabledorDisablingRegionInRIT(region)) {
2297               return;
2298             }
2299             // In case of assignment from EnableTableHandler table state is ENABLING. Any how
2300             // EnableTableHandler will set ENABLED after assigning all the table regions. If we
2301             // try to set to ENABLED directly then client API may think table is enabled.
2302             // When we have a case such as all the regions are added directly into hbase:meta and we call
2303             // assignRegion then we need to make the table ENABLED. Hence in such case the table
2304             // will not be in ENABLING or ENABLED state.
2305             TableName tableName = region.getTable();
2306             if (!tableStateManager.isTableState(tableName,
2307               ZooKeeperProtos.Table.State.ENABLED, ZooKeeperProtos.Table.State.ENABLING)) {
2308               LOG.debug("Setting table " + tableName + " to ENABLED state.");
2309               setEnabledTable(tableName);
2310             }
2311           }
2312         }
2313         if (setOfflineInZK && versionOfOfflineNode == -1) {
2314           LOG.info("Unable to set offline in ZooKeeper to assign " + region);
2315           // Setting offline in ZK must have been failed due to ZK racing or some
2316           // exception which may make the server to abort. If it is ZK racing,
2317           // we should retry since we already reset the region state,
2318           // existing (re)assignment will fail anyway.
2319           if (!server.isAborted()) {
2320             continue;
2321           }
2322         }
2323         LOG.info("Assigning " + region.getRegionNameAsString() +
2324             " to " + plan.getDestination());
2325         // Transition RegionState to PENDING_OPEN
2326         currentState = regionStates.updateRegionState(region,
2327           State.PENDING_OPEN, plan.getDestination());
2328 
2329         boolean needNewPlan;
2330         final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() +
2331             " to " + plan.getDestination();
2332         try {
2333           List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
2334           if (this.shouldAssignRegionsWithFavoredNodes) {
2335             favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
2336           }
2337           regionOpenState = serverManager.sendRegionOpen(
2338               plan.getDestination(), region, versionOfOfflineNode, favoredNodes);
2339 
2340           if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
2341             // Failed opening this region, looping again on a new server.
2342             needNewPlan = true;
2343             LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " +
2344                 " trying to assign elsewhere instead; " +
2345                 "try=" + i + " of " + this.maximumAttempts);
2346           } else {
2347             // we're done
2348             if (regionOpenState == RegionOpeningState.ALREADY_OPENED) {
2349               processAlreadyOpenedRegion(region, plan.getDestination());
2350             }
2351             return;
2352           }
2353 
2354         } catch (Throwable t) {
2355           if (t instanceof RemoteException) {
2356             t = ((RemoteException) t).unwrapRemoteException();
2357           }
2358           previousException = t;
2359 
2360           // Should we wait a little before retrying? If the server is starting it's yes.
2361           // If the region is already in transition, it's yes as well: we want to be sure that
2362           //  the region will get opened but we don't want a double assignment.
2363           boolean hold = (t instanceof RegionAlreadyInTransitionException ||
2364               t instanceof ServerNotRunningYetException);
2365 
2366           // In case socket is timed out and the region server is still online,
2367           // the openRegion RPC could have been accepted by the server and
2368           // just the response didn't go through.  So we will retry to
2369           // open the region on the same server to avoid possible
2370           // double assignment.
2371           boolean retry = !hold && (t instanceof java.net.SocketTimeoutException
2372               && this.serverManager.isServerOnline(plan.getDestination()));
2373 
2374 
2375           if (hold) {
2376             LOG.warn(assignMsg + ", waiting a little before trying on the same region server " +
2377               "try=" + i + " of " + this.maximumAttempts, t);
2378 
2379             if (maxWaitTime < 0) {
2380               if (t instanceof RegionAlreadyInTransitionException) {
2381                 maxWaitTime = EnvironmentEdgeManager.currentTime()
2382                   + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
2383                     DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
2384               } else {
2385                 maxWaitTime = EnvironmentEdgeManager.currentTime()
2386                   + this.server.getConfiguration().getLong(
2387                     "hbase.regionserver.rpc.startup.waittime", 60000);
2388               }
2389             }
2390             try {
2391               needNewPlan = false;
2392               long now = EnvironmentEdgeManager.currentTime();
2393               if (now < maxWaitTime) {
2394                 LOG.debug("Server is not yet up or region is already in transition; "
2395                   + "waiting up to " + (maxWaitTime - now) + "ms", t);
2396                 Thread.sleep(100);
2397                 i--; // reset the try count
2398               } else if (!(t instanceof RegionAlreadyInTransitionException)) {
2399                 LOG.debug("Server is not up for a while; try a new one", t);
2400                 needNewPlan = true;
2401               }
2402             } catch (InterruptedException ie) {
2403               LOG.warn("Failed to assign "
2404                   + region.getRegionNameAsString() + " since interrupted", ie);
2405               regionStates.updateRegionState(region, State.FAILED_OPEN);
2406               Thread.currentThread().interrupt();
2407               return;
2408             }
2409           } else if (retry) {
2410             needNewPlan = false;
2411             i--; // we want to retry as many times as needed as long as the RS is not dead.
2412             LOG.warn(assignMsg + ", trying to assign to the same region server due ", t);
2413           } else {
2414             needNewPlan = true;
2415             LOG.warn(assignMsg + ", trying to assign elsewhere instead;" +
2416                 " try=" + i + " of " + this.maximumAttempts, t);
2417           }
2418         }
2419 
2420         if (i == this.maximumAttempts) {
2421           // For meta region, we have to keep retrying until succeeding
2422           if (region.isMetaRegion()) {
2423             i = 0; // re-set attempt count to 0 for at least 1 retry
2424             LOG.warn(assignMsg +
2425                 ", trying to assign a hbase:meta region reached to maximumAttempts (" +
2426                 this.maximumAttempts + ").  Reset attempt counts and continue retrying.");
2427             waitForRetryingMetaAssignment();
2428           }
2429           else {
2430             // Don't reset the region state or get a new plan any more.
2431             // This is the last try.
2432             continue;
2433           }
2434         }
2435 
2436         // If region opened on destination of present plan, reassigning to new
2437         // RS may cause double assignments. In case of RegionAlreadyInTransitionException
2438         // reassigning to same RS.
2439         if (needNewPlan) {
2440           // Force a new plan and reassign. Will return null if no servers.
2441           // The new plan could be the same as the existing plan since we don't
2442           // exclude the server of the original plan, which should not be
2443           // excluded since it could be the only server up now.
2444           RegionPlan newPlan = null;
2445           try {
2446             newPlan = getRegionPlan(region, true);
2447           } catch (HBaseIOException e) {
2448             LOG.warn("Failed to get region plan", e);
2449           }
2450           if (newPlan == null) {
2451             regionStates.updateRegionState(region, State.FAILED_OPEN);
2452             LOG.warn("Unable to find a viable location to assign region " +
2453                 region.getRegionNameAsString());
2454             return;
2455           }
2456 
2457           if (!plan.equals(newPlan) &&
2458                 !plan.getDestination().equals(newPlan.getDestination())) {
2459             // Clean out plan we failed execute and one that doesn't look like it'll
2460             // succeed anyways; we need a new plan!
2461             // Transition back to OFFLINE
2462             LOG.info("Region assignment plan changed from " + plan.getDestination() + " to "
2463                 + newPlan.getDestination() + " server.");
2464             currentState = regionStates.updateRegionState(region, State.OFFLINE);
2465             versionOfOfflineNode = -1;
2466             if (useZKForAssignment) {
2467               setOfflineInZK = true;
2468             }
2469             plan = newPlan;
2470           } else if(plan.getDestination().equals(newPlan.getDestination()) &&
2471               previousException instanceof FailedServerException) {
2472             try {
2473               LOG.info("Trying to re-assign " + region.getRegionNameAsString() +
2474                 " to the same failed server.");
2475               Thread.sleep(1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
2476                 RpcClient.FAILED_SERVER_EXPIRY_DEFAULT));
2477             } catch (InterruptedException ie) {
2478               LOG.warn("Failed to assign "
2479                   + region.getRegionNameAsString() + " since interrupted", ie);
2480               regionStates.updateRegionState(region, State.FAILED_OPEN);
2481               Thread.currentThread().interrupt();
2482               return;
2483             }
2484           }
2485         }
2486       }
2487       // Run out of attempts
2488       regionStates.updateRegionState(region, State.FAILED_OPEN);
2489     } finally {
2490       metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTime() - startTime);
2491     }
2492   }
2493 
2494   private void processAlreadyOpenedRegion(HRegionInfo region, ServerName sn) {
2495     // Remove region from in-memory transition and unassigned node from ZK
2496     // While trying to enable the table the regions of the table were
2497     // already enabled.
2498     LOG.debug("ALREADY_OPENED " + region.getRegionNameAsString()
2499       + " to " + sn);
2500     String encodedName = region.getEncodedName();
2501 
2502     //If use ZkForAssignment, region already Opened event should not be handled,
2503     //leave it to zk event. See HBase-14407.
2504     if(useZKForAssignment){
2505       String node = ZKAssign.getNodeName(watcher, encodedName);
2506       Stat stat = new Stat();
2507       try {
2508         byte[] existingBytes = ZKUtil.getDataNoWatch(watcher, node, stat);
2509         if(existingBytes!=null){
2510           RegionTransition rt= RegionTransition.parseFrom(existingBytes);
2511           EventType et = rt.getEventType();
2512           if (et.equals(EventType.RS_ZK_REGION_OPENED)) {
2513             LOG.debug("ALREADY_OPENED " + region.getRegionNameAsString()
2514               + " and node in "+et+" state");
2515             return;
2516           }
2517         }
2518       } catch (KeeperException ke) {
2519         LOG.warn("Unexpected ZK exception getData " + node
2520           + " node for the region " + encodedName, ke);
2521       } catch (DeserializationException e) {
2522         LOG.warn("Get RegionTransition from zk deserialization failed! ", e);
2523       }
2524 
2525       deleteNodeInStates(encodedName, "offline", sn, EventType.M_ZK_REGION_OFFLINE);
2526     }
2527 
2528     regionStates.regionOnline(region, sn);
2529   }
2530 
2531   private boolean isDisabledorDisablingRegionInRIT(final HRegionInfo region) {
2532     if (this.tableStateManager.isTableState(region.getTable(),
2533         ZooKeeperProtos.Table.State.DISABLED,
2534         ZooKeeperProtos.Table.State.DISABLING) || replicasToClose.contains(region)) {
2535       LOG.info("Table " + region.getTable() + " is disabled or disabling;"
2536         + " skipping assign of " + region.getRegionNameAsString());
2537       offlineDisabledRegion(region);
2538       return true;
2539     }
2540     return false;
2541   }
2542 
2543   /**
2544    * Set region as OFFLINED up in zookeeper
2545    *
2546    * @param state
2547    * @return the version of the offline node if setting of the OFFLINE node was
2548    *         successful, -1 otherwise.
2549    */
2550   private int setOfflineInZooKeeper(final RegionState state, final ServerName destination) {
2551     if (!state.isClosed() && !state.isOffline()) {
2552       String msg = "Unexpected state : " + state + " .. Cannot transit it to OFFLINE.";
2553       this.server.abort(msg, new IllegalStateException(msg));
2554       return -1;
2555     }
2556     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
2557     int versionOfOfflineNode;
2558     try {
2559       // get the version after setting the znode to OFFLINE
2560       versionOfOfflineNode = ZKAssign.createOrForceNodeOffline(watcher,
2561         state.getRegion(), destination);
2562       if (versionOfOfflineNode == -1) {
2563         LOG.warn("Attempted to create/force node into OFFLINE state before "
2564             + "completing assignment but failed to do so for " + state);
2565         return -1;
2566       }
2567     } catch (KeeperException e) {
2568       server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
2569       return -1;
2570     }
2571     return versionOfOfflineNode;
2572   }
2573 
2574   /**
2575    * For a given cluster with mixed versions of servers, get a list of
2576    * servers with lower versions, where system table regions should not be
2577    * assigned to.
2578    * For system table, we must assign regions to a server with highest version.
2579    * However, we can disable this exclusion using config:
2580    * "hbase.min.version.move.system.tables" if checkForMinVersion is true.
2581    * Detailed explanation available with definition of minVersionToMoveSysTables.
2582    *
2583    * @return List of Excluded servers for System table regions.
2584    */
2585   public List<ServerName> getExcludedServersForSystemTable() {
2586     List<Pair<ServerName, String>> serverList = new ArrayList<>();
2587     for (ServerName s : serverManager.getOnlineServersList()) {
2588       serverList.add(new Pair<>(s, server.getRegionServerVersion(s)));
2589     }
2590     if (serverList.isEmpty()) {
2591       return Collections.emptyList();
2592     }
2593     String highestVersion = Collections.max(serverList,
2594         new Comparator<Pair<ServerName, String>>() {
2595       @Override
2596       public int compare(Pair<ServerName, String> o1, Pair<ServerName, String> o2) {
2597         return VersionInfo.compareVersion(o1.getSecond(), o2.getSecond());
2598       }
2599     }).getSecond();
2600     if (!DEFAULT_MIN_VERSION_MOVE_SYS_TABLES_CONFIG.equals(minVersionToMoveSysTables)) {
2601       int comparedValue = VersionInfo.compareVersion(minVersionToMoveSysTables,
2602           highestVersion);
2603       if (comparedValue > 0) {
2604         return Collections.emptyList();
2605       }
2606     }
2607     List<ServerName> res = new ArrayList<>();
2608     for (Pair<ServerName, String> pair : serverList) {
2609       if (!pair.getSecond().equals(highestVersion)) {
2610         res.add(pair.getFirst());
2611       }
2612     }
2613     return res;
2614   }
2615 
2616   /**
2617    * @param region the region to assign
2618    * @return Plan for passed <code>region</code> (If none currently, it creates one or
2619    * if no servers to assign, it returns null).
2620    */
2621   private RegionPlan getRegionPlan(final HRegionInfo region,
2622       final boolean forceNewPlan)  throws HBaseIOException {
2623     return getRegionPlan(region, null, forceNewPlan);
2624   }
2625 
2626   /**
2627    * @param region the region to assign
2628    * @param serverToExclude Server to exclude (we know its bad). Pass null if
2629    * all servers are thought to be assignable.
2630    * @param forceNewPlan If true, then if an existing plan exists, a new plan
2631    * will be generated.
2632    * @return Plan for passed <code>region</code> (If none currently, it creates one or
2633    * if no servers to assign, it returns null).
2634    */
2635   private RegionPlan getRegionPlan(final HRegionInfo region,
2636       final ServerName serverToExclude, final boolean forceNewPlan) {
2637     // Pickup existing plan or make a new one
2638     final String encodedName = region.getEncodedName();
2639     List<ServerName> exclude = new ArrayList<>();
2640     if (region.isSystemTable()) {
2641       exclude.addAll(getExcludedServersForSystemTable());
2642     }
2643     if (serverToExclude !=null) {
2644       exclude.add(serverToExclude);
2645     }
2646     final List<ServerName> destServers =
2647       serverManager.createDestinationServersList(exclude);
2648 
2649     if (destServers.isEmpty()){
2650       LOG.warn("Can't move " + encodedName +
2651         ", there is no destination server available.");
2652       return null;
2653     }
2654 
2655     RegionPlan randomPlan = null;
2656     boolean newPlan = false;
2657     RegionPlan existingPlan;
2658 
2659     synchronized (this.regionPlans) {
2660       existingPlan = this.regionPlans.get(encodedName);
2661 
2662       if (existingPlan != null && existingPlan.getDestination() != null) {
2663         LOG.debug("Found an existing plan for " + region.getRegionNameAsString()
2664           + " destination server is " + existingPlan.getDestination() +
2665             " accepted as a dest server = " + destServers.contains(existingPlan.getDestination()));
2666       }
2667 
2668       if (forceNewPlan
2669           || existingPlan == null
2670           || existingPlan.getDestination() == null
2671           || !destServers.contains(existingPlan.getDestination())) {
2672         newPlan = true;
2673       }
2674     }
2675 
2676     if (newPlan) {
2677       ServerName destination = null;
2678       try {
2679         destination = balancer.randomAssignment(region, destServers);
2680       } catch (HBaseIOException e) {
2681         LOG.warn(e);
2682       }
2683       if (destination == null) {
2684         LOG.warn("Can't find a destination for " + encodedName);
2685         return null;
2686       }
2687       synchronized (this.regionPlans) {
2688         randomPlan = new RegionPlan(region, null, destination);
2689         if (!region.isMetaTable() && shouldAssignRegionsWithFavoredNodes) {
2690           List<HRegionInfo> regions = new ArrayList<HRegionInfo>(1);
2691           regions.add(region);
2692           try {
2693             processFavoredNodes(regions);
2694           } catch (IOException ie) {
2695             LOG.warn("Ignoring exception in processFavoredNodes " + ie);
2696           }
2697         }
2698         this.regionPlans.put(encodedName, randomPlan);
2699       }
2700       LOG.debug("No previous transition plan found (or ignoring " + "an existing plan) for "
2701           + region.getRegionNameAsString() + "; generated random plan=" + randomPlan + "; "
2702           + destServers.size() + " (online=" + serverManager.getOnlineServers().size()
2703           + ") available servers, forceNewPlan=" + forceNewPlan);
2704       return randomPlan;
2705     }
2706     LOG.debug("Using pre-existing plan for " +
2707       region.getRegionNameAsString() + "; plan=" + existingPlan);
2708     return existingPlan;
2709   }
2710 
2711   /**
2712    * Wait for some time before retrying meta table region assignment
2713    */
2714   private void waitForRetryingMetaAssignment() {
2715     try {
2716       Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
2717     } catch (InterruptedException e) {
2718       LOG.error("Got exception while waiting for hbase:meta assignment");
2719       Thread.currentThread().interrupt();
2720     }
2721   }
2722 
2723   /**
2724    * Start a new thread to check if there are region servers whose versions are higher than others.
2725    * If so, move all system table regions to RS with the highest version to keep compatibility.
2726    * The reason is, RS in new version may not be able to access RS in old version when there are
2727    * some incompatible changes.
2728    */
2729   public void checkIfShouldMoveSystemRegionAsync() {
2730     new Thread(new Runnable() {
2731       @Override
2732       public void run() {
2733         try {
2734           synchronized (checkIfShouldMoveSystemRegionLock) {
2735             // RS register on ZK after reports startup on master
2736             List<HRegionInfo> regionsShouldMove = new ArrayList<>();
2737             for (ServerName server : getExcludedServersForSystemTable()) {
2738               regionsShouldMove.addAll(getCarryingSystemTables(server));
2739             }
2740             if (!regionsShouldMove.isEmpty()) {
2741               List<RegionPlan> plans = new ArrayList<>();
2742               for (HRegionInfo regionInfo : regionsShouldMove) {
2743                 RegionPlan plan = getRegionPlan(regionInfo, true);
2744                 if (regionInfo.isMetaRegion()) {
2745                   // Must move meta region first.
2746                   balance(plan);
2747                 } else {
2748                   plans.add(plan);
2749                 }
2750               }
2751               for (RegionPlan plan : plans) {
2752                 balance(plan);
2753               }
2754             }
2755           }
2756         } catch (Throwable t) {
2757           LOG.error(t);
2758         }
2759       }
2760     }).start();
2761   }
2762 
2763 
2764   /**
2765    * Unassigns the specified region.
2766    * <p>
2767    * Updates the RegionState and sends the CLOSE RPC unless region is being
2768    * split by regionserver; then the unassign fails (silently) because we
2769    * presume the region being unassigned no longer exists (its been split out
2770    * of existence). TODO: What to do if split fails and is rolled back and
2771    * parent is revivified?
2772    * <p>
2773    * If a RegionPlan is already set, it will remain.
2774    *
2775    * @param region server to be unassigned
2776    */
2777   public void unassign(HRegionInfo region) {
2778     unassign(region, false);
2779   }
2780 
2781 
2782   /**
2783    * Unassigns the specified region.
2784    * <p>
2785    * Updates the RegionState and sends the CLOSE RPC unless region is being
2786    * split by regionserver; then the unassign fails (silently) because we
2787    * presume the region being unassigned no longer exists (its been split out
2788    * of existence). TODO: What to do if split fails and is rolled back and
2789    * parent is revivified?
2790    * <p>
2791    * If a RegionPlan is already set, it will remain.
2792    *
2793    * @param region server to be unassigned
2794    * @param force if region should be closed even if already closing
2795    */
2796   public void unassign(HRegionInfo region, boolean force, ServerName dest) {
2797     // TODO: Method needs refactoring.  Ugly buried returns throughout.  Beware!
2798     LOG.debug("Starting unassign of " + region.getRegionNameAsString()
2799       + " (offlining), current state: " + regionStates.getRegionState(region));
2800 
2801     String encodedName = region.getEncodedName();
2802     // Grab the state of this region and synchronize on it
2803     int versionOfClosingNode = -1;
2804     // We need a lock here as we're going to do a put later and we don't want multiple states
2805     //  creation
2806     ReentrantLock lock = locker.acquireLock(encodedName);
2807     RegionState state = regionStates.getRegionTransitionState(encodedName);
2808     boolean reassign = true;
2809     try {
2810       if (state == null) {
2811         // Region is not in transition.
2812         // We can unassign it only if it's not SPLIT/MERGED.
2813         state = regionStates.getRegionState(encodedName);
2814         if (state != null && state.isUnassignable()) {
2815           LOG.info("Attempting to unassign " + state + ", ignored");
2816           // Offline region will be reassigned below
2817           return;
2818         }
2819         // Create the znode in CLOSING state
2820         try {
2821           if (state == null || state.getServerName() == null) {
2822             // We don't know where the region is, offline it.
2823             // No need to send CLOSE RPC
2824             LOG.warn("Attempting to unassign a region not in RegionStates "
2825               + region.getRegionNameAsString() + ", offlined");
2826             regionOffline(region);
2827             return;
2828           }
2829           if (useZKForAssignment) {
2830             versionOfClosingNode = ZKAssign.createNodeClosing(
2831               watcher, region, state.getServerName());
2832             if (versionOfClosingNode == -1) {
2833               LOG.info("Attempting to unassign " +
2834                 region.getRegionNameAsString() + " but ZK closing node "
2835                 + "can't be created.");
2836               reassign = false; // not unassigned at all
2837               return;
2838             }
2839           }
2840         } catch (KeeperException e) {
2841           if (e instanceof NodeExistsException) {
2842             // Handle race between master initiated close and regionserver
2843             // orchestrated splitting. See if existing node is in a
2844             // SPLITTING or SPLIT state.  If so, the regionserver started
2845             // an op on node before we could get our CLOSING in.  Deal.
2846             NodeExistsException nee = (NodeExistsException)e;
2847             String path = nee.getPath();
2848             try {
2849               if (isSplitOrSplittingOrMergedOrMerging(path)) {
2850                 LOG.debug(path + " is SPLIT or SPLITTING or MERGED or MERGING; " +
2851                   "skipping unassign because region no longer exists -- its split or merge");
2852                 reassign = false; // no need to reassign for split/merged region
2853                 return;
2854               }
2855             } catch (KeeperException.NoNodeException ke) {
2856               LOG.warn("Failed getData on SPLITTING/SPLIT at " + path +
2857                 "; presuming split and that the region to unassign, " +
2858                 encodedName + ", no longer exists -- confirm", ke);
2859               return;
2860             } catch (KeeperException ke) {
2861               LOG.error("Unexpected zk state", ke);
2862             } catch (DeserializationException de) {
2863               LOG.error("Failed parse", de);
2864             }
2865           }
2866           // If we get here, don't understand whats going on -- abort.
2867           server.abort("Unexpected ZK exception creating node CLOSING", e);
2868           reassign = false; // heading out already
2869           return;
2870         }
2871         state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2872       } else if (state.isFailedOpen()) {
2873         // The region is not open yet
2874         regionOffline(region);
2875         return;
2876       } else if (force && state.isPendingCloseOrClosing()) {
2877         LOG.debug("Attempting to unassign " + region.getRegionNameAsString() +
2878           " which is already " + state.getState()  +
2879           " but forcing to send a CLOSE RPC again ");
2880         if (state.isFailedClose()) {
2881           state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2882         }
2883         state.updateTimestampToNow();
2884       } else {
2885         LOG.debug("Attempting to unassign " +
2886           region.getRegionNameAsString() + " but it is " +
2887           "already in transition (" + state.getState() + ", force=" + force + ")");
2888         return;
2889       }
2890 
2891       unassign(region, state, versionOfClosingNode, dest, useZKForAssignment, null);
2892     } finally {
2893       lock.unlock();
2894 
2895       // Region is expected to be reassigned afterwards
2896       if (!replicasToClose.contains(region) && reassign && regionStates.isRegionOffline(region)) {
2897         assign(region, true);
2898       }
2899     }
2900   }
2901 
2902   public void unassign(HRegionInfo region, boolean force){
2903      unassign(region, force, null);
2904   }
2905 
2906   /**
2907    * @param region regioninfo of znode to be deleted.
2908    */
2909   public void deleteClosingOrClosedNode(HRegionInfo region, ServerName sn) {
2910     String encodedName = region.getEncodedName();
2911     deleteNodeInStates(encodedName, "closing", sn, EventType.M_ZK_REGION_CLOSING,
2912       EventType.RS_ZK_REGION_CLOSED);
2913   }
2914 
2915   /**
2916    * @param path
2917    * @return True if znode is in SPLIT or SPLITTING or MERGED or MERGING state.
2918    * @throws KeeperException Can happen if the znode went away in meantime.
2919    * @throws DeserializationException
2920    */
2921   private boolean isSplitOrSplittingOrMergedOrMerging(final String path)
2922       throws KeeperException, DeserializationException {
2923     boolean result = false;
2924     // This may fail if the SPLIT or SPLITTING or MERGED or MERGING znode gets
2925     // cleaned up before we can get data from it.
2926     byte [] data = ZKAssign.getData(watcher, path);
2927     if (data == null) {
2928       LOG.info("Node " + path + " is gone");
2929       return false;
2930     }
2931     RegionTransition rt = RegionTransition.parseFrom(data);
2932     switch (rt.getEventType()) {
2933     case RS_ZK_REQUEST_REGION_SPLIT:
2934     case RS_ZK_REGION_SPLIT:
2935     case RS_ZK_REGION_SPLITTING:
2936     case RS_ZK_REQUEST_REGION_MERGE:
2937     case RS_ZK_REGION_MERGED:
2938     case RS_ZK_REGION_MERGING:
2939       result = true;
2940       break;
2941     default:
2942       LOG.info("Node " + path + " is in " + rt.getEventType());
2943       break;
2944     }
2945     return result;
2946   }
2947 
2948   /**
2949    * Used by unit tests. Return the number of regions opened so far in the life
2950    * of the master. Increases by one every time the master opens a region
2951    * @return the counter value of the number of regions opened so far
2952    */
2953   public int getNumRegionsOpened() {
2954     return numRegionsOpened.get();
2955   }
2956 
2957   /**
2958    * Waits until the specified region has completed assignment.
2959    * <p>
2960    * If the region is already assigned, returns immediately.  Otherwise, method
2961    * blocks until the region is assigned.
2962    * @param regionInfo region to wait on assignment for
2963    * @return true if the region is assigned false otherwise.
2964    * @throws InterruptedException
2965    */
2966   public boolean waitForAssignment(HRegionInfo regionInfo)
2967       throws InterruptedException {
2968     ArrayList<HRegionInfo> regionSet = new ArrayList<HRegionInfo>(1);
2969     regionSet.add(regionInfo);
2970     return waitForAssignment(regionSet, true, Long.MAX_VALUE);
2971   }
2972 
2973   /**
2974    * Waits until the specified region has completed assignment, or the deadline is reached.
2975    */
2976   protected boolean waitForAssignment(final Collection<HRegionInfo> regionSet,
2977       final boolean waitTillAllAssigned, final int reassigningRegions,
2978       final long minEndTime) throws InterruptedException {
2979     long deadline = minEndTime + bulkPerRegionOpenTimeGuesstimate * (reassigningRegions + 1);
2980     if (deadline < 0) { // Overflow
2981       deadline = Long.MAX_VALUE; // wait forever
2982     }
2983     return waitForAssignment(regionSet, waitTillAllAssigned, deadline);
2984   }
2985 
2986   /**
2987    * Waits until the specified region has completed assignment, or the deadline is reached.
2988    * @param regionSet set of region to wait on. the set is modified and the assigned regions removed
2989    * @param waitTillAllAssigned true if we should wait all the regions to be assigned
2990    * @param deadline the timestamp after which the wait is aborted
2991    * @return true if all the regions are assigned false otherwise.
2992    * @throws InterruptedException
2993    */
2994   protected boolean waitForAssignment(final Collection<HRegionInfo> regionSet,
2995       final boolean waitTillAllAssigned, final long deadline) throws InterruptedException {
2996     // We're not synchronizing on regionsInTransition now because we don't use any iterator.
2997     while (!regionSet.isEmpty() && !server.isStopped() && deadline > System.currentTimeMillis()) {
2998       int failedOpenCount = 0;
2999       Iterator<HRegionInfo> regionInfoIterator = regionSet.iterator();
3000       while (regionInfoIterator.hasNext()) {
3001         HRegionInfo hri = regionInfoIterator.next();
3002         if (regionStates.isRegionOnline(hri) || regionStates.isRegionInState(hri,
3003             State.SPLITTING, State.SPLIT, State.MERGING, State.MERGED)) {
3004           regionInfoIterator.remove();
3005         } else if (regionStates.isRegionInState(hri, State.FAILED_OPEN)) {
3006           failedOpenCount++;
3007         }
3008       }
3009       if (!waitTillAllAssigned) {
3010         // No need to wait, let assignment going on asynchronously
3011         break;
3012       }
3013       if (!regionSet.isEmpty()) {
3014         if (failedOpenCount == regionSet.size()) {
3015           // all the regions we are waiting had an error on open.
3016           break;
3017         }
3018         regionStates.waitForUpdate(100);
3019       }
3020     }
3021     return regionSet.isEmpty();
3022   }
3023 
3024   /**
3025    * Assigns the hbase:meta region or a replica.
3026    * <p>
3027    * Assumes that hbase:meta is currently closed and is not being actively served by
3028    * any RegionServer.
3029    * <p>
3030    * Forcibly unsets the current meta region location in ZooKeeper and assigns
3031    * hbase:meta to a random RegionServer.
3032    * @param hri TODO
3033    * @throws KeeperException
3034    */
3035   public void assignMeta(HRegionInfo hri) throws KeeperException {
3036     this.server.getMetaTableLocator().deleteMetaLocation(this.watcher, hri.getReplicaId());
3037     assign(hri, true);
3038   }
3039 
3040   /**
3041    * Assigns specified regions retaining assignments, if any.
3042    * <p>
3043    * This is a synchronous call and will return once every region has been
3044    * assigned.  If anything fails, an exception is thrown
3045    * @throws InterruptedException
3046    * @throws IOException
3047    */
3048   public void assign(Map<HRegionInfo, ServerName> regions)
3049         throws IOException, InterruptedException {
3050     if (regions == null || regions.isEmpty()) {
3051       return;
3052     }
3053     List<ServerName> servers = serverManager.createDestinationServersList();
3054     if (servers == null || servers.isEmpty()) {
3055       throw new IOException("Found no destination server to assign region(s)");
3056     }
3057 
3058     // Reuse existing assignment info
3059     Map<ServerName, List<HRegionInfo>> bulkPlan =
3060       balancer.retainAssignment(regions, servers);
3061     if (bulkPlan == null) {
3062       throw new IOException("Unable to determine a plan to assign region(s)");
3063     }
3064 
3065     processBogusAssignments(bulkPlan);
3066 
3067     assign(regions.size(), servers.size(),
3068       "retainAssignment=true", bulkPlan);
3069   }
3070 
3071   /**
3072    * Assigns specified regions round robin, if any.
3073    * <p>
3074    * This is a synchronous call and will return once every region has been
3075    * assigned.  If anything fails, an exception is thrown
3076    * @throws InterruptedException
3077    * @throws IOException
3078    */
3079   public void assign(List<HRegionInfo> regions)
3080         throws IOException, InterruptedException {
3081     if (regions == null || regions.isEmpty()) {
3082       return;
3083     }
3084 
3085     List<ServerName> servers = serverManager.createDestinationServersList();
3086     if (servers == null || servers.isEmpty()) {
3087       throw new IOException("Found no destination server to assign region(s)");
3088     }
3089 
3090     // Generate a round-robin bulk assignment plan
3091     Map<ServerName, List<HRegionInfo>> bulkPlan = balancer.roundRobinAssignment(regions, servers);
3092     if (bulkPlan == null) {
3093       throw new IOException("Unable to determine a plan to assign region(s)");
3094     }
3095 
3096     processBogusAssignments(bulkPlan);
3097 
3098     processFavoredNodes(regions);
3099     assign(regions.size(), servers.size(), "round-robin=true", bulkPlan);
3100   }
3101 
  /**
   * Executes a bulk assignment plan.
   * <p>
   * Small plans (a single server, or below both the region and server bulk
   * thresholds) are driven serially from this thread; larger plans are handed to
   * a {@link GeneralBulkAssigner} backed by a fixed-size thread pool.
   *
   * @param regions total number of regions in the plan (used for threshold checks and logging)
   * @param totalServers total number of candidate servers (logging only)
   * @param message description of the plan type, included in the bulk-assign log line
   * @param bulkPlan mapping of destination server to the regions it should receive
   */
  private void assign(int regions, int totalServers,
      String message, Map<ServerName, List<HRegionInfo>> bulkPlan)
          throws InterruptedException, IOException {

    int servers = bulkPlan.size();
    if (servers == 1 || (regions < bulkAssignThresholdRegions
        && servers < bulkAssignThresholdServers)) {

      // Not use bulk assignment.  This could be more efficient in small
      // cluster, especially mini cluster for testing, so that tests won't time out
      if (LOG.isTraceEnabled()) {
        LOG.trace("Not using bulk assignment since we are assigning only " + regions +
          " region(s) to " + servers + " server(s)");
      }

      // invoke assignment (async)
      ArrayList<HRegionInfo> userRegionSet = new ArrayList<HRegionInfo>(regions);
      for (Map.Entry<ServerName, List<HRegionInfo>> plan: bulkPlan.entrySet()) {
        // If the per-server batch assign fails, fall back to assigning each of
        // its regions individually (async), and remember the user-table ones so
        // we can wait on them below. System-table regions are not waited on.
        if (!assign(plan.getKey(), plan.getValue())) {
          for (HRegionInfo region: plan.getValue()) {
            if (!regionStates.isRegionOnline(region)) {
              invokeAssign(region);
              if (!region.getTable().isSystemTable()) {
                userRegionSet.add(region);
              }
            }
          }
        }
      }

      // wait for assignment completion
      if (!waitForAssignment(userRegionSet, true, userRegionSet.size(),
            System.currentTimeMillis())) {
        LOG.debug("some user regions are still in transition: " + userRegionSet);
      }
    } else {
      LOG.info("Bulk assigning " + regions + " region(s) across "
        + totalServers + " server(s), " + message);

      // Use fixed count thread pool assigning.
      BulkAssigner ba = new GeneralBulkAssigner(
        this.server, bulkPlan, this, bulkAssignWaitTillAllAssigned);
      ba.bulkAssign();
      LOG.info("Bulk assigning done");
    }
  }
3148 
3149   /**
3150    * Assigns all user regions, if any exist.  Used during cluster startup.
3151    * <p>
3152    * This is a synchronous call and will return once every region has been
3153    * assigned.  If anything fails, an exception is thrown and the cluster
3154    * should be shutdown.
3155    * @throws InterruptedException
3156    * @throws IOException
3157    */
3158   private void assignAllUserRegions(Map<HRegionInfo, ServerName> allRegions)
3159       throws IOException, InterruptedException {
3160     if (allRegions == null || allRegions.isEmpty()) return;
3161 
3162     // Determine what type of assignment to do on startup
3163     boolean retainAssignment = server.getConfiguration().
3164       getBoolean("hbase.master.startup.retainassign", true);
3165 
3166     Set<HRegionInfo> regionsFromMetaScan = allRegions.keySet();
3167     if (retainAssignment) {
3168       assign(allRegions);
3169     } else {
3170       List<HRegionInfo> regions = new ArrayList<HRegionInfo>(regionsFromMetaScan);
3171       assign(regions);
3172     }
3173 
3174     for (HRegionInfo hri : regionsFromMetaScan) {
3175       TableName tableName = hri.getTable();
3176       if (!tableStateManager.isTableState(tableName,
3177           ZooKeeperProtos.Table.State.ENABLED)) {
3178         setEnabledTable(tableName);
3179       }
3180     }
3181     // assign all the replicas that were not recorded in the meta
3182     assign(replicaRegionsNotRecordedInMeta(regionsFromMetaScan, server));
3183   }
3184 
3185   /**
3186    * Get a list of replica regions that are:
3187    * not recorded in meta yet. We might not have recorded the locations
3188    * for the replicas since the replicas may not have been online yet, master restarted
3189    * in the middle of assigning, ZK erased, etc.
3190    * @param regionsRecordedInMeta the list of regions we know are recorded in meta
3191    * either as a default, or, as the location of a replica
3192    * @param master
3193    * @return list of replica regions
3194    * @throws IOException
3195    */
3196   public static List<HRegionInfo> replicaRegionsNotRecordedInMeta(
3197       Set<HRegionInfo> regionsRecordedInMeta, MasterServices master)throws IOException {
3198     List<HRegionInfo> regionsNotRecordedInMeta = new ArrayList<HRegionInfo>();
3199     for (HRegionInfo hri : regionsRecordedInMeta) {
3200       TableName table = hri.getTable();
3201       HTableDescriptor htd = master.getTableDescriptors().get(table);
3202       // look at the HTD for the replica count. That's the source of truth
3203       int desiredRegionReplication = htd.getRegionReplication();
3204       for (int i = 0; i < desiredRegionReplication; i++) {
3205         HRegionInfo replica = RegionReplicaUtil.getRegionInfoForReplica(hri, i);
3206         if (regionsRecordedInMeta.contains(replica)) continue;
3207         regionsNotRecordedInMeta.add(replica);
3208       }
3209     }
3210     return regionsNotRecordedInMeta;
3211   }
3212 
3213   /**
3214    * Wait until no regions in transition.
3215    * @param timeout How long to wait.
3216    * @return True if nothing in regions in transition.
3217    * @throws InterruptedException
3218    */
3219   boolean waitUntilNoRegionsInTransition(final long timeout)
3220       throws InterruptedException {
3221     // Blocks until there are no regions in transition. It is possible that
3222     // there
3223     // are regions in transition immediately after this returns but guarantees
3224     // that if it returns without an exception that there was a period of time
3225     // with no regions in transition from the point-of-view of the in-memory
3226     // state of the Master.
3227     final long endTime = System.currentTimeMillis() + timeout;
3228 
3229     while (!this.server.isStopped() && regionStates.isRegionsInTransition()
3230         && endTime > System.currentTimeMillis()) {
3231       regionStates.waitForUpdate(100);
3232     }
3233 
3234     return !regionStates.isRegionsInTransition();
3235   }
3236 
3237   /**
3238    * Rebuild the list of user regions and assignment information.
3239    * Updates regionstates with findings as we go through list of regions.
3240    * @return set of servers not online that hosted some regions according to a scan of hbase:meta
3241    * @throws IOException
3242    */
3243   Set<ServerName> rebuildUserRegions() throws
3244       IOException, KeeperException, CoordinatedStateException {
3245     Set<TableName> disabledOrEnablingTables = tableStateManager.getTablesInStates(
3246       ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.ENABLING);
3247 
3248     Set<TableName> disabledOrDisablingOrEnabling = tableStateManager.getTablesInStates(
3249       ZooKeeperProtos.Table.State.DISABLED,
3250       ZooKeeperProtos.Table.State.DISABLING,
3251       ZooKeeperProtos.Table.State.ENABLING);
3252 
3253     // Region assignment from META
3254     List<Result> results = MetaTableAccessor.fullScanOfMeta(server.getConnection());
3255     // Get any new but slow to checkin region server that joined the cluster
3256     Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
3257     // Set of offline servers to be returned
3258     Set<ServerName> offlineServers = new HashSet<ServerName>();
3259     // Iterate regions in META
3260     for (Result result : results) {
3261       if (result == null && LOG.isDebugEnabled()){
3262         LOG.debug("null result from meta - ignoring but this is strange.");
3263         continue;
3264       }
3265       // keep a track of replicas to close. These were the replicas of the originally
3266       // unmerged regions. The master might have closed them before but it mightn't
3267       // maybe because it crashed.
3268       PairOfSameType<HRegionInfo> p = MetaTableAccessor.getMergeRegions(result);
3269       if (p.getFirst() != null && p.getSecond() != null) {
3270         HTableDescriptor desc = server.getTableDescriptors().get(p.getFirst().getTable());
3271         if (desc != null) {
3272           int numReplicas = desc.getRegionReplication();
3273           for (HRegionInfo merge : p) {
3274             for (int i = 1; i < numReplicas; i++) {
3275               replicasToClose.add(RegionReplicaUtil.getRegionInfoForReplica(merge, i));
3276             }
3277           }
3278         } else {
3279           LOG.warn("Found no table descriptor on filesystem for " + p.getFirst().getTable());
3280         }
3281       }
3282       RegionLocations rl =  MetaTableAccessor.getRegionLocations(result);
3283       if (rl == null) continue;
3284       HRegionLocation[] locations = rl.getRegionLocations();
3285       if (locations == null) continue;
3286       for (HRegionLocation hrl : locations) {
3287         if (hrl == null) continue;
3288         HRegionInfo regionInfo = hrl.getRegionInfo();
3289         if (regionInfo == null) continue;
3290         int replicaId = regionInfo.getReplicaId();
3291         State state = RegionStateStore.getRegionState(result, replicaId);
3292         // keep a track of replicas to close. These were the replicas of the split parents
3293         // from the previous life of the master. The master should have closed them before
3294         // but it couldn't maybe because it crashed
3295         if (replicaId == 0 && state.equals(State.SPLIT)) {
3296           for (HRegionLocation h : locations) {
3297             replicasToClose.add(h.getRegionInfo());
3298           }
3299         }
3300         ServerName lastHost = hrl.getServerName();
3301         ServerName regionLocation = RegionStateStore.getRegionServer(result, replicaId);
3302         if (tableStateManager.isTableState(regionInfo.getTable(),
3303              ZooKeeperProtos.Table.State.DISABLED)) {
3304           // force region to forget it hosts for disabled/disabling tables.
3305           // see HBASE-13326
3306           lastHost = null;
3307           regionLocation = null;
3308         }
3309         regionStates.createRegionState(regionInfo, state, regionLocation, lastHost);
3310         if (!regionStates.isRegionInState(regionInfo, State.OPEN)) {
3311           // Region is not open (either offline or in transition), skip
3312           continue;
3313         }
3314         TableName tableName = regionInfo.getTable();
3315         if (!onlineServers.contains(regionLocation)) {
3316           // Region is located on a server that isn't online
3317           offlineServers.add(regionLocation);
3318           if (useZKForAssignment) {
3319             regionStates.regionOffline(regionInfo);
3320           }
3321         } else if (!disabledOrEnablingTables.contains(tableName)) {
3322           // Region is being served and on an active server
3323           // add only if region not in disabled or enabling table
3324           regionStates.regionOnline(regionInfo, regionLocation);
3325           balancer.regionOnline(regionInfo, regionLocation);
3326         } else if (useZKForAssignment) {
3327           regionStates.regionOffline(regionInfo);
3328         }
3329         // need to enable the table if not disabled or disabling or enabling
3330         // this will be used in rolling restarts
3331         if (!disabledOrDisablingOrEnabling.contains(tableName)
3332           && !getTableStateManager().isTableState(tableName,
3333             ZooKeeperProtos.Table.State.ENABLED)) {
3334           setEnabledTable(tableName);
3335         }
3336       }
3337     }
3338     return offlineServers;
3339   }
3340 
3341   /**
3342    * Recover the tables that were not fully moved to DISABLED state. These
3343    * tables are in DISABLING state when the master restarted/switched.
3344    *
3345    * @throws KeeperException
3346    * @throws TableNotFoundException
3347    * @throws IOException
3348    */
3349   private void recoverTableInDisablingState()
3350       throws KeeperException, IOException, CoordinatedStateException {
3351     Set<TableName> disablingTables =
3352       tableStateManager.getTablesInStates(ZooKeeperProtos.Table.State.DISABLING);
3353     if (disablingTables.size() != 0) {
3354       for (TableName tableName : disablingTables) {
3355         // Recover by calling DisableTableHandler
3356         LOG.info("The table " + tableName
3357             + " is in DISABLING state.  Hence recovering by moving the table"
3358             + " to DISABLED state.");
3359         new DisableTableHandler(this.server, tableName,
3360             this, tableLockManager, true).prepare().process();
3361       }
3362     }
3363   }
3364 
3365   /**
3366    * Recover the tables that are not fully moved to ENABLED state. These tables
3367    * are in ENABLING state when the master restarted/switched
3368    *
3369    * @throws KeeperException
3370    * @throws org.apache.hadoop.hbase.TableNotFoundException
3371    * @throws IOException
3372    */
3373   private void recoverTableInEnablingState()
3374       throws KeeperException, IOException, CoordinatedStateException {
3375     Set<TableName> enablingTables = tableStateManager.
3376       getTablesInStates(ZooKeeperProtos.Table.State.ENABLING);
3377     if (enablingTables.size() != 0) {
3378       for (TableName tableName : enablingTables) {
3379         // Recover by calling EnableTableHandler
3380         LOG.info("The table " + tableName
3381             + " is in ENABLING state.  Hence recovering by moving the table"
3382             + " to ENABLED state.");
3383         // enableTable in sync way during master startup,
3384         // no need to invoke coprocessor
3385         EnableTableHandler eth = new EnableTableHandler(this.server, tableName,
3386           this, tableLockManager, true);
3387         try {
3388           eth.prepare();
3389         } catch (TableNotFoundException e) {
3390           LOG.warn("Table " + tableName + " not found in hbase:meta to recover.");
3391           continue;
3392         }
3393         eth.process();
3394       }
3395     }
3396   }
3397 
3398   /**
3399    * Processes list of dead servers from result of hbase:meta scan and regions in RIT.
3400    * This is used for failover to recover the lost regions that belonged to
3401    * RegionServers which failed while there was no active master or are offline for whatever
3402    * reason and for regions that were in RIT.
3403    *
3404    * @param deadServers
3405    *          The list of dead servers which failed while there was no active master. Can be null.
3406    * @throws IOException
3407    * @throws KeeperException
3408    */
3409   private void processDeadServersAndRecoverLostRegions(Set<ServerName> deadServers)
3410   throws IOException, KeeperException {
3411     if (deadServers != null && !deadServers.isEmpty()) {
3412       for (ServerName serverName: deadServers) {
3413         if (!serverManager.isServerDead(serverName)) {
3414           serverManager.expireServer(serverName); // Let SSH do region re-assign
3415         }
3416       }
3417     }
3418 
3419     List<String> nodes = useZKForAssignment ?
3420       ZKUtil.listChildrenAndWatchForNewChildren(watcher, watcher.assignmentZNode)
3421       : ZKUtil.listChildrenNoWatch(watcher, watcher.assignmentZNode);
3422     if (nodes != null && !nodes.isEmpty()) {
3423       for (String encodedRegionName : nodes) {
3424         processRegionInTransition(encodedRegionName, null);
3425       }
3426     } else if (!useZKForAssignment) {
3427       processRegionInTransitionZkLess();
3428     }
3429   }
3430 
3431   void processRegionInTransitionZkLess() {
3432     // We need to send RPC call again for PENDING_OPEN/PENDING_CLOSE regions
3433     // in case the RPC call is not sent out yet before the master was shut down
3434     // since we update the state before we send the RPC call. We can't update
3435     // the state after the RPC call. Otherwise, we don't know what's happened
3436     // to the region if the master dies right after the RPC call is out.
3437     Set<RegionState> rits = regionStates.getRegionsInTransition();
3438     for (RegionState regionState : rits) {
3439       LOG.info("Processing " + regionState);
3440       ServerName serverName = regionState.getServerName();
3441       // Server could be null in case of FAILED_OPEN when master cannot find a region plan. In that
3442       // case, try assigning it here.
3443       if (serverName != null
3444           && !serverManager.getOnlineServers().containsKey(serverName)) {
3445         LOG.info("Server " + serverName + " isn't online. SSH will handle this");
3446         continue;
3447       }
3448       HRegionInfo regionInfo = regionState.getRegion();
3449       State state = regionState.getState();
3450 
3451       switch (state) {
3452       case CLOSED:
3453         invokeAssign(regionInfo);
3454         break;
3455       case PENDING_OPEN:
3456         retrySendRegionOpen(regionState);
3457         break;
3458       case PENDING_CLOSE:
3459         retrySendRegionClose(regionState);
3460         break;
3461       case FAILED_CLOSE:
3462       case FAILED_OPEN:
3463         invokeUnAssign(regionInfo);
3464         break;
3465       default:
3466         // No process for other states
3467       }
3468     }
3469   }
3470 
3471   /**
3472    * At master failover, for pending_open region, make sure
3473    * sendRegionOpen RPC call is sent to the target regionserver
3474    */
3475   private void retrySendRegionOpen(final RegionState regionState) {
3476     this.executorService.submit(
3477       new EventHandler(server, EventType.M_MASTER_RECOVERY) {
3478         @Override
3479         public void process() throws IOException {
3480           HRegionInfo hri = regionState.getRegion();
3481           ServerName serverName = regionState.getServerName();
3482           ReentrantLock lock = locker.acquireLock(hri.getEncodedName());
3483           try {
3484             for (int i = 1; i <= maximumAttempts; i++) {
3485               if (!serverManager.isServerOnline(serverName)
3486                   || server.isStopped() || server.isAborted()) {
3487                 return; // No need any more
3488               }
3489               try {
3490                 if (!regionState.equals(regionStates.getRegionState(hri))) {
3491                   return; // Region is not in the expected state any more
3492                 }
3493                 List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
3494                 if (shouldAssignRegionsWithFavoredNodes) {
3495                   favoredNodes = ((FavoredNodeLoadBalancer)balancer).getFavoredNodes(hri);
3496                 }
3497                 RegionOpeningState regionOpenState = serverManager.sendRegionOpen(
3498                   serverName, hri, -1, favoredNodes);
3499 
3500                 if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
3501                   // Failed opening this region, this means the target server didn't get
3502                   // the original region open RPC, so re-assign it with a new plan
3503                   LOG.debug("Got failed_opening in retry sendRegionOpen for "
3504                     + regionState + ", re-assign it");
3505                   invokeAssign(hri, true);
3506                 }
3507                 return; // Done.
3508               } catch (Throwable t) {
3509                 if (t instanceof RemoteException) {
3510                   t = ((RemoteException) t).unwrapRemoteException();
3511                 }
3512                 // In case SocketTimeoutException/FailedServerException, retry
3513                 if (t instanceof java.net.SocketTimeoutException
3514                     || t instanceof FailedServerException) {
3515                   Threads.sleep(100);
3516                   continue;
3517                 }
3518                 // For other exceptions, re-assign it
3519                 LOG.debug("Got exception in retry sendRegionOpen for "
3520                   + regionState + ", re-assign it", t);
3521                 invokeAssign(hri);
3522                 return; // Done.
3523               }
3524             }
3525           } finally {
3526             lock.unlock();
3527           }
3528         }
3529       });
3530   }
3531 
3532   /**
3533    * At master failover, for pending_close region, make sure
3534    * sendRegionClose RPC call is sent to the target regionserver
3535    */
3536   private void retrySendRegionClose(final RegionState regionState) {
3537     this.executorService.submit(
3538       new EventHandler(server, EventType.M_MASTER_RECOVERY) {
3539         @Override
3540         public void process() throws IOException {
3541           HRegionInfo hri = regionState.getRegion();
3542           ServerName serverName = regionState.getServerName();
3543           ReentrantLock lock = locker.acquireLock(hri.getEncodedName());
3544           try {
3545             for (int i = 1; i <= maximumAttempts; i++) {
3546               if (!serverManager.isServerOnline(serverName)
3547                   || server.isStopped() || server.isAborted()) {
3548                 return; // No need any more
3549               }
3550               try {
3551                 if (!regionState.equals(regionStates.getRegionState(hri))) {
3552                   return; // Region is not in the expected state any more
3553                 }
3554                 if (!serverManager.sendRegionClose(serverName, hri, -1, null, false)) {
3555                   // This means the region is still on the target server
3556                   LOG.debug("Got false in retry sendRegionClose for "
3557                     + regionState + ", re-close it");
3558                   invokeUnAssign(hri);
3559                 }
3560                 return; // Done.
3561               } catch (Throwable t) {
3562                 if (t instanceof RemoteException) {
3563                   t = ((RemoteException) t).unwrapRemoteException();
3564                 }
3565                 // In case SocketTimeoutException/FailedServerException, retry
3566                 if (t instanceof java.net.SocketTimeoutException
3567                     || t instanceof FailedServerException) {
3568                   Threads.sleep(100);
3569                   continue;
3570                 }
3571                 if (!(t instanceof NotServingRegionException
3572                     || t instanceof RegionAlreadyInTransitionException)) {
3573                   // NotServingRegionException/RegionAlreadyInTransitionException
3574                   // means the target server got the original region close request.
3575                   // For other exceptions, re-close it
3576                   LOG.debug("Got exception in retry sendRegionClose for "
3577                     + regionState + ", re-close it", t);
3578                   invokeUnAssign(hri);
3579                 }
3580                 return; // Done.
3581               }
3582             }
3583           } finally {
3584             lock.unlock();
3585           }
3586         }
3587       });
3588   }
3589 
3590   /**
3591    * Set Regions in transitions metrics.
3592    * This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized.
3593    * This iterator is not fail fast, which may lead to stale read; but that's better than
3594    * creating a copy of the map for metrics computation, as this method will be invoked
3595    * on a frequent interval.
3596    */
3597   public void updateRegionsInTransitionMetrics() {
3598     long currentTime = System.currentTimeMillis();
3599     int totalRITs = 0;
3600     int totalRITsOverThreshold = 0;
3601     long oldestRITTime = 0;
3602     Map<String, RegionState> ritsOverThreshold = null;
3603     int ritThreshold = this.server.getConfiguration().
3604       getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
3605     for (RegionState state: regionStates.getRegionsInTransition()) {
3606       totalRITs++;
3607       long ritTime = currentTime - state.getStamp();
3608       if (ritTime > ritThreshold) { // more than the threshold
3609         totalRITsOverThreshold++;
3610         if (ritsOverThreshold == null) {
3611           ritsOverThreshold = new HashMap<>();
3612         }
3613         ritsOverThreshold.put(state.getRegion().getEncodedName(), state);
3614       }
3615       if (oldestRITTime < ritTime) {
3616         oldestRITTime = ritTime;
3617       }
3618     }
3619     if (LOG.isDebugEnabled() && ritsOverThreshold != null && !ritsOverThreshold.isEmpty()) {
3620       StringBuilder sb = new StringBuilder();
3621       for (Map.Entry<String, RegionState> rit: ritsOverThreshold.entrySet()) {
3622         sb.append(rit.getKey()).append(":")
3623           .append(rit.getValue().getState().name()).append("\n");
3624       }
3625       sb.delete(sb.length()-1, sb.length());
3626       LOG.debug("RITs over threshold: " + sb.toString());
3627     }
3628     if (this.metricsAssignmentManager != null) {
3629       this.metricsAssignmentManager.updateRITOldestAge(oldestRITTime);
3630       this.metricsAssignmentManager.updateRITCount(totalRITs);
3631       this.metricsAssignmentManager.updateRITCountOverThreshold(totalRITsOverThreshold);
3632     }
3633   }
3634 
3635   /**
3636    * @param region Region whose plan we are to clear.
3637    */
3638   void clearRegionPlan(final HRegionInfo region) {
3639     synchronized (this.regionPlans) {
3640       this.regionPlans.remove(region.getEncodedName());
3641     }
3642   }
3643 
3644   /**
3645    * Wait on region to clear regions-in-transition.
3646    * @param hri Region to wait on.
3647    * @throws IOException
3648    */
3649   public void waitOnRegionToClearRegionsInTransition(final HRegionInfo hri)
3650       throws IOException, InterruptedException {
3651     waitOnRegionToClearRegionsInTransition(hri, -1L);
3652   }
3653 
3654   /**
3655    * Wait on region to clear regions-in-transition or time out
3656    * @param hri
3657    * @param timeOut Milliseconds to wait for current region to be out of transition state.
3658    * @return True when a region clears regions-in-transition before timeout otherwise false
3659    * @throws InterruptedException
3660    */
3661   public boolean waitOnRegionToClearRegionsInTransition(final HRegionInfo hri, long timeOut)
3662       throws InterruptedException {
3663     if (!regionStates.isRegionInTransition(hri)) return true;
3664     long end = (timeOut <= 0) ? Long.MAX_VALUE : EnvironmentEdgeManager.currentTime()
3665         + timeOut;
3666     // There is already a timeout monitor on regions in transition so I
3667     // should not have to have one here too?
3668     LOG.info("Waiting for " + hri.getEncodedName() +
3669         " to leave regions-in-transition, timeOut=" + timeOut + " ms.");
3670     while (!this.server.isStopped() && regionStates.isRegionInTransition(hri)) {
3671       regionStates.waitForUpdate(100);
3672       if (EnvironmentEdgeManager.currentTime() > end) {
3673         LOG.info("Timed out on waiting for " + hri.getEncodedName() + " to be assigned.");
3674         return false;
3675       }
3676     }
3677     if (this.server.isStopped()) {
3678       LOG.info("Giving up wait on regions in transition because stoppable.isStopped is set");
3679       return false;
3680     }
3681     return true;
3682   }
3683 
  /** Submits an assign of the region to the thread pool for immediate execution. */
  void invokeAssignNow(HRegionInfo regionInfo, boolean forceNewPlan) {
    threadPoolExecutorService.submit(new AssignCallable(this, regionInfo, forceNewPlan));
  }
3687 
  /** Schedules an assign of the region to run after {@code sleepMillis} milliseconds. */
  void invokeAssignLater(HRegionInfo regionInfo, boolean forceNewPlan, long sleepMillis) {
    scheduledThreadPoolExecutor.schedule(new DelayedAssignCallable(new AssignCallable(this,
            regionInfo, forceNewPlan)), sleepMillis, TimeUnit.MILLISECONDS);
  }
3692 
  /** Assigns the region, always forcing a new region plan. */
  public void invokeAssign(HRegionInfo regionInfo) {
    invokeAssign(regionInfo, true);
  }
3696 
3697   public void invokeAssign(HRegionInfo regionInfo, boolean forceNewPlan) {
3698     if (failedOpenTracker.containsKey(regionInfo.getEncodedName())) {
3699       // Sleep before reassigning if this region has failed to open before
3700       long sleepTime = backoffPolicy.getBackoffTime(retryConfig,
3701         getFailedAttempts(regionInfo.getEncodedName()));
3702       invokeAssignLater(regionInfo, forceNewPlan, sleepTime);
3703     } else {
3704       // Immediately reassign if this region has never failed an open before
3705       invokeAssignNow(regionInfo, forceNewPlan);
3706     }
3707   }
3708 
3709   private int getFailedAttempts(String regionName) {
3710     AtomicInteger failedCount = failedOpenTracker.get(regionName);
3711     if (failedCount != null) {
3712       return failedCount.get();
3713     } else {
3714       // If we do not have a failed open tracker for a region assume it has never failed before
3715       return 0;
3716     }
3717   }
3718 
  /** Submits an unassign (close) of the region to the thread pool. */
  void invokeUnAssign(HRegionInfo regionInfo) {
    threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
  }
3722 
  /** @return whether the given server hosts the default hbase:meta region. */
  public ServerHostRegion isCarryingMeta(ServerName serverName) {
    return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO);
  }
3726 
  /** @return whether the given server hosts the hbase:meta replica with the given replica id. */
  public ServerHostRegion isCarryingMetaReplica(ServerName serverName, int replicaId) {
    return isCarryingRegion(serverName,
        RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, replicaId));
  }
3731 
  /** @return whether the given server hosts the given hbase:meta replica region. */
  public ServerHostRegion isCarryingMetaReplica(ServerName serverName, HRegionInfo metaHri) {
    return isCarryingRegion(serverName, metaHri);
  }
3735 
3736   private List<HRegionInfo> getCarryingSystemTables(ServerName serverName) {
3737     Set<HRegionInfo> regions = this.getRegionStates().getServerRegions(serverName);
3738     if (regions == null) {
3739       return new ArrayList<>();
3740     }
3741     List<HRegionInfo> list = new ArrayList<>();
3742     for (HRegionInfo region : regions) {
3743       if (region.isSystemTable()) {
3744         list.add(region);
3745       }
3746     }
3747     return list;
3748   }
3749 
3750   /**
3751    * Check if the shutdown server carries the specific region.
3752    * We have a bunch of places that store region location
3753    * Those values aren't consistent. There is a delay of notification.
3754    * The location from zookeeper unassigned node has the most recent data;
3755    * but the node could be deleted after the region is opened by AM.
3756    * The AM's info could be old when OpenedRegionHandler
3757    * processing hasn't finished yet when server shutdown occurs.
3758    * @return whether the serverName currently hosts the region
3759    */
3760   private ServerHostRegion isCarryingRegion(ServerName serverName, HRegionInfo hri) {
3761     RegionTransition rt = null;
3762     try {
3763       byte [] data = ZKAssign.getData(watcher, hri.getEncodedName());
3764       // This call can legitimately come by null
3765       rt = data == null? null: RegionTransition.parseFrom(data);
3766     } catch (KeeperException e) {
3767       server.abort("Exception reading unassigned node for region=" + hri.getEncodedName(), e);
3768     } catch (DeserializationException e) {
3769       server.abort("Exception parsing unassigned node for region=" + hri.getEncodedName(), e);
3770     }
3771 
3772     ServerName addressFromZK = rt != null? rt.getServerName():  null;
3773     if (addressFromZK != null) {
3774       // if we get something from ZK, we will use the data
3775       boolean matchZK = addressFromZK.equals(serverName);
3776       LOG.debug("Checking region=" + hri.getRegionNameAsString() + ", zk server=" + addressFromZK +
3777         " current=" + serverName + ", matches=" + matchZK);
3778       return matchZK ? ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
3779     }
3780 
3781     ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
3782     if (LOG.isDebugEnabled()) {
3783       LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
3784         " is on server=" + (addressFromAM != null ? addressFromAM : "null") +
3785         " server being checked: " + serverName);
3786     }
3787     if (addressFromAM != null) {
3788       return addressFromAM.equals(serverName) ?
3789           ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
3790     }
3791 
3792     if (hri.isMetaRegion() && RegionReplicaUtil.isDefaultReplica(hri)) {
3793       // For the Meta region (default replica), we can do one more check on MetaTableLocator
3794       final ServerName serverNameInZK =
3795           server.getMetaTableLocator().getMetaRegionLocation(this.server.getZooKeeper());
3796       if (LOG.isDebugEnabled()) {
3797         LOG.debug("Based on MetaTableLocator, the META region is on server=" +
3798           (serverNameInZK == null ? "null" : serverNameInZK) +
3799           " server being checked: " + serverName);
3800       }
3801       if (serverNameInZK != null) {
3802         return serverNameInZK.equals(serverName) ?
3803             ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
3804       }
3805     }
3806 
3807     // Checked everywhere, if reaching here, we are unsure whether the server is carrying region.
3808     return ServerHostRegion.UNKNOWN;
3809   }
3810 
3811   /**
3812    * Clean out crashed server removing any assignments.
3813    * @param sn Server that went down.
3814    * @return list of regions in transition on this server
3815    */
3816   public List<HRegionInfo> cleanOutCrashedServerReferences(final ServerName sn) {
3817     // Clean out any existing assignment plans for this server
3818     synchronized (this.regionPlans) {
3819       for (Iterator <Map.Entry<String, RegionPlan>> i = this.regionPlans.entrySet().iterator();
3820           i.hasNext();) {
3821         Map.Entry<String, RegionPlan> e = i.next();
3822         ServerName otherSn = e.getValue().getDestination();
3823         // The name will be null if the region is planned for a random assign.
3824         if (otherSn != null && otherSn.equals(sn)) {
3825           // Use iterator's remove else we'll get CME
3826           i.remove();
3827         }
3828       }
3829     }
3830     List<HRegionInfo> regions = regionStates.serverOffline(watcher, sn);
3831     for (Iterator<HRegionInfo> it = regions.iterator(); it.hasNext(); ) {
3832       HRegionInfo hri = it.next();
3833       String encodedName = hri.getEncodedName();
3834 
3835       // We need a lock on the region as we could update it
3836       Lock lock = locker.acquireLock(encodedName);
3837       try {
3838         RegionState regionState = regionStates.getRegionTransitionState(encodedName);
3839         if (regionState == null
3840             || (regionState.getServerName() != null && !regionState.isOnServer(sn))
3841             || !(regionState.isFailedClose() || regionState.isOffline()
3842               || regionState.isPendingOpenOrOpening())) {
3843           LOG.info("Skip " + regionState + " since it is not opening/failed_close"
3844             + " on the dead server any more: " + sn);
3845           it.remove();
3846         } else {
3847           try {
3848             // Delete the ZNode if exists
3849             ZKAssign.deleteNodeFailSilent(watcher, hri);
3850           } catch (KeeperException ke) {
3851             server.abort("Unexpected ZK exception deleting node " + hri, ke);
3852           }
3853           if (tableStateManager.isTableState(hri.getTable(),
3854               ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
3855             regionStates.regionOffline(hri);
3856             it.remove();
3857             continue;
3858           }
3859           // Mark the region offline and assign it again by SSH
3860           regionStates.updateRegionState(hri, State.OFFLINE);
3861         }
3862       } finally {
3863         lock.unlock();
3864       }
3865     }
3866     return regions;
3867   }
3868 
3869   /**
3870    * @param plan Plan to execute.
3871    */
3872   public void balance(final RegionPlan plan) {
3873 
3874     HRegionInfo hri = plan.getRegionInfo();
3875     TableName tableName = hri.getTable();
3876     if (tableStateManager.isTableState(tableName,
3877       ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
3878       LOG.info("Ignored moving region of disabling/disabled table "
3879         + tableName);
3880       return;
3881     }
3882 
3883     // Move the region only if it's assigned
3884     String encodedName = hri.getEncodedName();
3885     ReentrantLock lock = locker.acquireLock(encodedName);
3886     try {
3887       if (!regionStates.isRegionOnline(hri)) {
3888         RegionState state = regionStates.getRegionState(encodedName);
3889         LOG.info("Ignored moving region not assigned: " + hri + ", "
3890           + (state == null ? "not in region states" : state));
3891         return;
3892       }
3893       synchronized (this.regionPlans) {
3894         this.regionPlans.put(plan.getRegionName(), plan);
3895       }
3896       unassign(hri, false, plan.getDestination());
3897     } finally {
3898       lock.unlock();
3899     }
3900   }
3901 
  /** Stops this assignment manager; delegates to {@link #shutdown()}. */
  public void stop() {
    shutdown(); // Stop executor service, etc
  }
3905 
3906   /**
3907    * Shutdown the threadpool executor service
3908    */
3909   public void shutdown() {
3910     // It's an immediate shutdown, so we're clearing the remaining tasks.
3911     synchronized (zkEventWorkerWaitingList){
3912       zkEventWorkerWaitingList.clear();
3913     }
3914 
3915     // Shutdown the threadpool executor service
3916     threadPoolExecutorService.shutdownNow();
3917     scheduledThreadPoolExecutor.shutdownNow();
3918     zkEventWorkers.shutdownNow();
3919     regionStateStore.stop();
3920   }
3921 
3922   protected void setEnabledTable(TableName tableName) {
3923     try {
3924       this.tableStateManager.setTableState(tableName,
3925         ZooKeeperProtos.Table.State.ENABLED);
3926     } catch (CoordinatedStateException e) {
3927       // here we can abort as it is the start up flow
3928       String errorMsg = "Unable to ensure that the table " + tableName
3929           + " will be" + " enabled because of a ZooKeeper issue";
3930       LOG.error(errorMsg);
3931       this.server.abort(errorMsg, e);
3932     }
3933   }
3934 
3935   /**
3936    * Set region as OFFLINED up in zookeeper asynchronously.
3937    * @param state
3938    * @return True if we succeeded, false otherwise (State was incorrect or failed
3939    * updating zk).
3940    */
3941   private boolean asyncSetOfflineInZooKeeper(final RegionState state,
3942       final AsyncCallback.StringCallback cb, final ServerName destination) {
3943     if (!state.isClosed() && !state.isOffline()) {
3944       this.server.abort("Unexpected state trying to OFFLINE; " + state,
3945         new IllegalStateException());
3946       return false;
3947     }
3948     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
3949     try {
3950       ZKAssign.asyncCreateNodeOffline(watcher, state.getRegion(),
3951         destination, cb, state);
3952     } catch (KeeperException e) {
3953       if (e instanceof NodeExistsException) {
3954         LOG.warn("Node for " + state.getRegion() + " already exists");
3955       } else {
3956         server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
3957       }
3958       return false;
3959     }
3960     return true;
3961   }
3962 
3963   private boolean deleteNodeInStates(String encodedName,
3964       String desc, ServerName sn, EventType... types) {
3965     try {
3966       for (EventType et: types) {
3967         if (ZKAssign.deleteNode(watcher, encodedName, et, sn)) {
3968           return true;
3969         }
3970       }
3971       LOG.info("Failed to delete the " + desc + " node for "
3972         + encodedName + ". The node type may not match");
3973     } catch (NoNodeException e) {
3974       if (LOG.isDebugEnabled()) {
3975         LOG.debug("The " + desc + " node for " + encodedName + " already deleted");
3976       }
3977     } catch (KeeperException ke) {
3978       server.abort("Unexpected ZK exception deleting " + desc
3979         + " node for the region " + encodedName, ke);
3980     }
3981     return false;
3982   }
3983 
  /** Deletes the region's merge transition ZNode, whichever merge node type is present. */
  private void deleteMergingNode(String encodedName, ServerName sn) {
    deleteNodeInStates(encodedName, "merging", sn, EventType.RS_ZK_REGION_MERGING,
      EventType.RS_ZK_REQUEST_REGION_MERGE, EventType.RS_ZK_REGION_MERGED);
  }
3988 
  /** Deletes the region's split transition ZNode, whichever split node type is present. */
  private void deleteSplittingNode(String encodedName, ServerName sn) {
    deleteNodeInStates(encodedName, "splitting", sn, EventType.RS_ZK_REGION_SPLITTING,
      EventType.RS_ZK_REQUEST_REGION_SPLIT, EventType.RS_ZK_REGION_SPLIT);
  }
3993 
3994   @edu.umd.cs.findbugs.annotations.SuppressWarnings(
3995       value="AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION",
3996       justification="Modification of Maps not ATOMIC!!!! FIX!!!")
3997   private void onRegionFailedOpen(
3998       final HRegionInfo hri, final ServerName sn) {
3999     String encodedName = hri.getEncodedName();
4000     // FindBugs: AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION Worth fixing!!!
4001     AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
4002     if (failedOpenCount == null) {
4003       failedOpenCount = new AtomicInteger();
4004       // No need to use putIfAbsent, or extra synchronization since
4005       // this whole handleRegion block is locked on the encoded region
4006       // name, and failedOpenTracker is updated only in this block
4007       // FindBugs: AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION
4008       failedOpenTracker.put(encodedName, failedOpenCount);
4009     }
4010     if (failedOpenCount.incrementAndGet() >= maximumAttempts && !hri.isMetaRegion()) {
4011       // FindBugs: AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION
4012       regionStates.updateRegionState(hri, State.FAILED_OPEN);
4013       // remove the tracking info to save memory, also reset
4014       // the count for next open initiative
4015       failedOpenTracker.remove(encodedName);
4016     } else {
4017       if (hri.isMetaRegion() && failedOpenCount.get() >= maximumAttempts) {
4018         // Log a warning message if a meta region failedOpenCount exceeds maximumAttempts
4019         // so that we are aware of potential problem if it persists for a long time.
4020         LOG.warn("Failed to open the hbase:meta region " +
4021             hri.getRegionNameAsString() + " after" +
4022             failedOpenCount.get() + " retries. Continue retrying.");
4023       }
4024 
4025       // Handle this the same as if it were opened and then closed.
4026       RegionState regionState = regionStates.updateRegionState(hri, State.CLOSED);
4027       if (regionState != null) {
4028         // When there are more than one region server a new RS is selected as the
4029         // destination and the same is updated in the region plan. (HBASE-5546)
4030         if (getTableStateManager().isTableState(hri.getTable(),
4031             ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) ||
4032             replicasToClose.contains(hri)) {
4033           offlineDisabledRegion(hri);
4034           return;
4035         }
4036         // ZK Node is in CLOSED state, assign it.
4037          regionStates.updateRegionState(hri, RegionState.State.CLOSED);
4038         // This below has to do w/ online enable/disable of a table
4039         removeClosedRegion(hri);
4040         getRegionPlan(hri, sn, true);
4041         invokeAssign(hri, false);
4042       }
4043     }
4044   }
4045 
  /**
   * Callback for a successful region open reported by a regionserver:
   * marks the region online, cleans up the region's unassigned ZNode
   * (ZK-based assignment only), resets the failed-open counter, and
   * re-closes the region if its table became disabled/disabling meanwhile.
   */
  private void onRegionOpen(final HRegionInfo hri, final ServerName sn, long openSeqNum) {
    regionOnline(hri, sn, openSeqNum);
    if (useZKForAssignment) {
      try {
        // Delete the ZNode if exists
        ZKAssign.deleteNodeFailSilent(watcher, hri);
      } catch (KeeperException ke) {
        server.abort("Unexpected ZK exception deleting node " + hri, ke);
      }
    }

    // reset the count, if any
    failedOpenTracker.remove(hri.getEncodedName());
    if (getTableStateManager().isTableState(hri.getTable(),
        ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
      // Table was disabled while the open was in flight: close it right back.
      invokeUnAssign(hri);
    }
  }
4064 
  /**
   * Callback for a region close reported by a regionserver. Regions of
   * disabled/disabling tables (or replicas scheduled for close) are taken
   * offline; any other region is marked CLOSED and re-assigned.
   */
  private void onRegionClosed(final HRegionInfo hri) {
    if (getTableStateManager().isTableState(hri.getTable(),
        ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) ||
        replicasToClose.contains(hri)) {
      offlineDisabledRegion(hri);
      return;
    }
    regionStates.updateRegionState(hri, RegionState.State.CLOSED);
    sendRegionClosedNotification(hri);
    // This below has to do w/ online enable/disable of a table
    removeClosedRegion(hri);
    // Re-assign without forcing a new plan; back-off applies if it failed before.
    invokeAssign(hri, false);
  }
4078 
4079   private String checkInStateForSplit(ServerName sn,
4080       final HRegionInfo p, final HRegionInfo a, final HRegionInfo b) {
4081     final RegionState rs_p = regionStates.getRegionState(p);
4082     RegionState rs_a = regionStates.getRegionState(a);
4083     RegionState rs_b = regionStates.getRegionState(b);
4084     if (!(rs_p.isOpenOrSplittingOnServer(sn)
4085         && (rs_a == null || rs_a.isOpenOrSplittingNewOnServer(sn))
4086         && (rs_b == null || rs_b.isOpenOrSplittingNewOnServer(sn)))) {
4087       return "Not in state good for split";
4088     }
4089     return "";
4090   }
4091 
4092   private String onRegionSplitReverted(ServerName sn,
4093       final HRegionInfo p, final HRegionInfo a, final HRegionInfo b) {
4094     String s = checkInStateForSplit(sn, p, a, b);
4095     if (!org.apache.commons.lang.StringUtils.isEmpty(s)) {
4096       return s;
4097     }
4098 
4099     // Always bring the parent back online. Even if it's not offline
4100     // There's no harm in making it online again.
4101     regionOnline(p, sn);
4102 
4103     // Only offline the region if they are known to exist.
4104     RegionState regionStateA = regionStates.getRegionState(a);
4105     RegionState regionStateB = regionStates.getRegionState(b);
4106     if (regionStateA != null) {
4107       regionOffline(a);
4108     }
4109     if (regionStateB != null) {
4110       regionOffline(b);
4111     }
4112 
4113     if (getTableStateManager().isTableState(p.getTable(),
4114         ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
4115       invokeUnAssign(p);
4116     }
4117     return null;
4118   }
4119 
  /**
   * Handles split transition reports (SPLITTING / SPLIT / SPLIT_PONR) from a
   * regionserver for parent {@code p} and daughters {@code a}/{@code b}.
   * @return null on success, otherwise an error message explaining the rejection
   */
  private String onRegionSplit(ServerName sn, TransitionCode code,
      final HRegionInfo p, final HRegionInfo a, final HRegionInfo b) {
    String s = checkInStateForSplit(sn, p, a, b);
    if (!org.apache.commons.lang.StringUtils.isEmpty(s)) {
      return s;
    }
    // Record the in-flight split in memory for all three regions.
    regionStates.updateRegionState(a, State.SPLITTING_NEW, sn);
    regionStates.updateRegionState(b, State.SPLITTING_NEW, sn);
    regionStates.updateRegionState(p, State.SPLITTING);

    if (code == TransitionCode.SPLIT) {
      if (TEST_SKIP_SPLIT_HANDLING) {
        return "Skipping split message, TEST_SKIP_SPLIT_HANDLING is set";
      }
      regionOffline(p, State.SPLIT);
      regionOnline(a, sn, 1);
      regionOnline(b, sn, 1);

      // User could disable the table before master knows the new region.
      if (getTableStateManager().isTableState(p.getTable(),
          ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
        invokeUnAssign(a);
        invokeUnAssign(b);
      } else {
        // Split the replica regions asynchronously, off the reporting thread.
        Callable<Object> splitReplicasCallable = new Callable<Object>() {
          @Override
          public Object call() {
            doSplittingOfReplicas(p, a, b);
            return null;
          }
        };
        threadPoolExecutorService.submit(splitReplicasCallable);
      }
    } else if (code == TransitionCode.SPLIT_PONR) {
      // Past the point of no return: persist the split in hbase:meta.
      try {
        regionStates.splitRegion(p, a, b, sn);
      } catch (IOException ioe) {
        LOG.info("Failed to record split region " + p.getShortNameToLog());
        return "Failed to record the splitting in meta";
      }
    }
    return null;
  }
4163 
4164   private String onRegionMerge(ServerName sn, TransitionCode code,
4165       final HRegionInfo p, final HRegionInfo a, final HRegionInfo b) {
4166     RegionState rs_p = regionStates.getRegionState(p);
4167     RegionState rs_a = regionStates.getRegionState(a);
4168     RegionState rs_b = regionStates.getRegionState(b);
4169     if (!(rs_a.isOpenOrMergingOnServer(sn) && rs_b.isOpenOrMergingOnServer(sn)
4170         && (rs_p == null || rs_p.isOpenOrMergingNewOnServer(sn)))) {
4171       return "Not in state good for merge";
4172     }
4173     regionStates.updateRegionState(a, State.MERGING);
4174     regionStates.updateRegionState(b, State.MERGING);
4175     regionStates.updateRegionState(p, State.MERGING_NEW, sn);
4176 
4177     String encodedName = p.getEncodedName();
4178     if (code == TransitionCode.READY_TO_MERGE) {
4179       mergingRegions.put(encodedName,
4180         new PairOfSameType<HRegionInfo>(a, b));
4181     } else if (code == TransitionCode.MERGED) {
4182 
4183       if (TEST_SKIP_MERGE_HANDLING) {
4184         return "Skipping merge message, TEST_SKIP_MERGE_HANDLING is set for merge parent: " + p;
4185       }
4186 
4187       mergingRegions.remove(encodedName);
4188       regionOffline(a, State.MERGED);
4189       regionOffline(b, State.MERGED);
4190       regionOnline(p, sn, 1);
4191 
4192       // User could disable the table before master knows the new region.
4193       if (getTableStateManager().isTableState(p.getTable(),
4194           ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
4195         invokeUnAssign(p);
4196       } else {
4197         Callable<Object> mergeReplicasCallable = new Callable<Object>() {
4198           @Override
4199           public Object call() {
4200             doMergingOfReplicas(p, a, b);
4201             return null;
4202           }
4203         };
4204         threadPoolExecutorService.submit(mergeReplicasCallable);
4205       }
4206     } else if (code == TransitionCode.MERGE_PONR) {
4207       try {
4208         regionStates.mergeRegions(p, a, b, sn);
4209       } catch (IOException ioe) {
4210         LOG.info("Failed to record merged region " + p.getShortNameToLog());
4211         return "Failed to record the merging in meta";
4212       }
4213     }
4214     return null;
4215   }
4216 
4217   private String onRegionMergeReverted(ServerName sn, TransitionCode code,
4218 	      final HRegionInfo p, final HRegionInfo a, final HRegionInfo b) {
4219     RegionState rs_p = regionStates.getRegionState(p);
4220     String encodedName = p.getEncodedName();
4221     mergingRegions.remove(encodedName);
4222 
4223     // Always bring the children back online. Even if they are not offline
4224     // there's no harm in making them online again.
4225     regionOnline(a, sn);
4226     regionOnline(b, sn);
4227 
4228     // Only offline the merging region if it is known to exist.
4229     if (rs_p != null) {
4230       regionOffline(p);
4231     }
4232 
4233     if (getTableStateManager().isTableState(p.getTable(),
4234         ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
4235       invokeUnAssign(a);
4236       invokeUnAssign(b);
4237     }
4238 
4239     return null;
4240   }
4241 
4242   /**
4243    * A helper to handle region merging transition event.
4244    * It transitions merging regions to MERGING state.
4245    */
  /**
   * A helper to handle region merging transition event.
   * It transitions merging regions to MERGING state.
   *
   * Parses the three regions (merged parent + the two merging children)
   * from the transition payload, validates their states, coordinates the
   * pending_merge znode transition when requested, and — on a MERGED
   * event — applies the merge in memory and deletes the znode.
   *
   * @return true if the event was handled (and any znode should stay/has
   *         been cleaned up), false if the event was dropped
   */
  private boolean handleRegionMerging(final RegionTransition rt, final String encodedName,
      final String prettyPrintedRegionName, final ServerName sn) {
    if (!serverManager.isServerOnline(sn)) {
      LOG.warn("Dropped merging! ServerName=" + sn + " unknown.");
      return false;
    }
    // Payload carries [merged parent, region a, region b].
    byte [] payloadOfMerging = rt.getPayload();
    List<HRegionInfo> mergingRegions;
    try {
      mergingRegions = HRegionInfo.parseDelimitedFrom(
        payloadOfMerging, 0, payloadOfMerging.length);
    } catch (IOException e) {
      LOG.error("Dropped merging! Failed reading "  + rt.getEventType()
        + " payload for " + prettyPrintedRegionName);
      return false;
    }
    assert mergingRegions.size() == 3;
    HRegionInfo p = mergingRegions.get(0);
    HRegionInfo hri_a = mergingRegions.get(1);
    HRegionInfo hri_b = mergingRegions.get(2);

    RegionState rs_p = regionStates.getRegionState(p);
    RegionState rs_a = regionStates.getRegionState(hri_a);
    RegionState rs_b = regionStates.getRegionState(hri_b);

    // Children must be open/merging on the server (or not yet tracked);
    // the merged parent may not exist yet.
    if (!((rs_a == null || rs_a.isOpenOrMergingOnServer(sn))
        && (rs_b == null || rs_b.isOpenOrMergingOnServer(sn))
        && (rs_p == null || rs_p.isOpenOrMergingNewOnServer(sn)))) {
      LOG.warn("Dropped merging! Not in state good for MERGING; rs_p="
        + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
      return false;
    }

    EventType et = rt.getEventType();
    if (et == EventType.RS_ZK_REQUEST_REGION_MERGE) {
      // Ask the merge coordination to move the pending_merge znode to
      // merging. A znode version of -1 means the transition did not happen;
      // re-read the znode to see whether the RS already moved it on.
      try {
        RegionMergeCoordination.RegionMergeDetails std =
            ((BaseCoordinatedStateManager) server.getCoordinatedStateManager())
                .getRegionMergeCoordination().getDefaultDetails();
        ((BaseCoordinatedStateManager) server.getCoordinatedStateManager())
            .getRegionMergeCoordination().processRegionMergeRequest(p, hri_a, hri_b, sn, std);
        if (((ZkRegionMergeCoordination.ZkRegionMergeDetails) std).getZnodeVersion() == -1) {
          byte[] data = ZKAssign.getData(watcher, encodedName);
         EventType currentType = null;
          if (data != null) {
            RegionTransition newRt = RegionTransition.parseFrom(data);
            currentType = newRt.getEventType();
          }
          // Acceptable only if the RS itself advanced the node to
          // MERGING or MERGED in the meantime.
          if (currentType == null || (currentType != EventType.RS_ZK_REGION_MERGED
              && currentType != EventType.RS_ZK_REGION_MERGING)) {
            LOG.warn("Failed to transition pending_merge node "
              + encodedName + " to merging, it's now " + currentType);
            return false;
          }
        }
      } catch (Exception e) {
        LOG.warn("Failed to transition pending_merge node "
          + encodedName + " to merging", e);
        return false;
      }
    }

    // Apply the in-memory state changes atomically w.r.t. regionStates.
    synchronized (regionStates) {
      regionStates.updateRegionState(hri_a, State.MERGING);
      regionStates.updateRegionState(hri_b, State.MERGING);
      regionStates.updateRegionState(p, State.MERGING_NEW, sn);

      // Test-only hook: leave the merge half-done.
      if (TEST_SKIP_MERGE_HANDLING) {
        LOG.warn("Skipping merge message, TEST_SKIP_MERGE_HANDLING is set for merge parent: " + p);
        return true; // return true so that the merging node stays
      }

      if (et != EventType.RS_ZK_REGION_MERGED) {
        // Merge still in flight: remember the pair for a possible revert.
        this.mergingRegions.put(encodedName,
          new PairOfSameType<HRegionInfo>(hri_a, hri_b));
      } else {
        // Merge done: children offline as MERGED, parent online.
        this.mergingRegions.remove(encodedName);
        regionOffline(hri_a, State.MERGED);
        regionOffline(hri_b, State.MERGED);
        regionOnline(p, sn);
      }
    }

    if (et == EventType.RS_ZK_REGION_MERGED) {
      doMergingOfReplicas(p, hri_a, hri_b);
      LOG.debug("Handling MERGED event for " + encodedName + "; deleting node");
      // Remove region from ZK
      try {
        boolean successful = false;
        while (!successful) {
          // It's possible that the RS tickles in between the reading of the
          // znode and the deleting, so it's safe to retry.
          successful = ZKAssign.deleteNode(watcher, encodedName,
            EventType.RS_ZK_REGION_MERGED, sn);
        }
      } catch (KeeperException e) {
        if (e instanceof NoNodeException) {
          // NOTE(review): the logged path joins splitLogZNode even though
          // this is a merge node — looks copied from the split path; the
          // message may print the wrong znode. Confirm intended path.
          String znodePath = ZKUtil.joinZNode(watcher.splitLogZNode, encodedName);
          LOG.debug("The znode " + znodePath + " does not exist.  May be deleted already.");
        } else {
          server.abort("Error deleting MERGED node " + encodedName, e);
        }
      }
      LOG.info("Handled MERGED event; merged=" + p.getRegionNameAsString()
        + ", region_a=" + hri_a.getRegionNameAsString() + ", region_b="
        + hri_b.getRegionNameAsString() + ", on " + sn);

      // User could disable the table before master knows the new region.
      if (tableStateManager.isTableState(p.getTable(),
          ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
        unassign(p);
      }
    }
    return true;
  }
4361 
4362   /**
4363    * A helper to handle region splitting transition event.
4364    */
  /**
   * A helper to handle region splitting transition event.
   *
   * Parses the two daughter regions from the transition payload, validates
   * states, coordinates the pending_split znode transition when requested,
   * and — on a SPLIT event — applies the split in memory and deletes the
   * znode.
   *
   * @return true if the event was handled (and any znode should stay/has
   *         been cleaned up), false if the event was dropped
   */
  private boolean handleRegionSplitting(final RegionTransition rt, final String encodedName,
      final String prettyPrintedRegionName, final ServerName sn) {
    if (!serverManager.isServerOnline(sn)) {
      LOG.warn("Dropped splitting! ServerName=" + sn + " unknown.");
      return false;
    }
    // Payload carries the two daughters; the parent comes from the
    // transition itself (via encodedName).
    byte [] payloadOfSplitting = rt.getPayload();
    List<HRegionInfo> splittingRegions;
    try {
      splittingRegions = HRegionInfo.parseDelimitedFrom(
        payloadOfSplitting, 0, payloadOfSplitting.length);
    } catch (IOException e) {
      LOG.error("Dropped splitting! Failed reading " + rt.getEventType()
        + " payload for " + prettyPrintedRegionName);
      return false;
    }
    assert splittingRegions.size() == 2;
    HRegionInfo hri_a = splittingRegions.get(0);
    HRegionInfo hri_b = splittingRegions.get(1);

    RegionState rs_p = regionStates.getRegionState(encodedName);
    RegionState rs_a = regionStates.getRegionState(hri_a);
    RegionState rs_b = regionStates.getRegionState(hri_b);

    // Parent must be open/splitting on the server (or not yet tracked);
    // daughters may not exist yet.
    if (!((rs_p == null || rs_p.isOpenOrSplittingOnServer(sn))
        && (rs_a == null || rs_a.isOpenOrSplittingNewOnServer(sn))
        && (rs_b == null || rs_b.isOpenOrSplittingNewOnServer(sn)))) {
      LOG.warn("Dropped splitting! Not in state good for SPLITTING; rs_p="
        + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
      return false;
    }

    if (rs_p == null) {
      // Splitting region should be online
      rs_p = regionStates.updateRegionState(rt, State.OPEN);
      if (rs_p == null) {
        LOG.warn("Received splitting for region " + prettyPrintedRegionName
          + " from server " + sn + " but it doesn't exist anymore,"
          + " probably already processed its split");
        return false;
      }
      regionStates.regionOnline(rs_p.getRegion(), sn);
    }

    HRegionInfo p = rs_p.getRegion();
    EventType et = rt.getEventType();
    if (et == EventType.RS_ZK_REQUEST_REGION_SPLIT) {
      // Ask the split coordination to move the pending_split znode to
      // splitting. A return of -1 means the transition did not happen;
      // re-read the znode to see whether the RS already moved it on.
      try {
        SplitTransactionDetails std =
            ((BaseCoordinatedStateManager) server.getCoordinatedStateManager())
                .getSplitTransactionCoordination().getDefaultDetails();
        if (((BaseCoordinatedStateManager) server.getCoordinatedStateManager())
            .getSplitTransactionCoordination().processTransition(p, hri_a, hri_b, sn, std) == -1) {
          byte[] data = ZKAssign.getData(watcher, encodedName);
          EventType currentType = null;
          if (data != null) {
            RegionTransition newRt = RegionTransition.parseFrom(data);
            currentType = newRt.getEventType();
          }
          // Acceptable only if the RS itself advanced the node to
          // SPLITTING or SPLIT in the meantime.
          if (currentType == null
              || (currentType != EventType.RS_ZK_REGION_SPLIT && currentType != EventType.RS_ZK_REGION_SPLITTING)) {
            LOG.warn("Failed to transition pending_split node " + encodedName
                + " to splitting, it's now " + currentType);
            return false;
          }
        }
      } catch (Exception e) {
        LOG.warn("Failed to transition pending_split node " + encodedName + " to splitting", e);
        return false;
      }
    }

    // Apply the in-memory state changes atomically w.r.t. regionStates.
    synchronized (regionStates) {
      splitRegions.put(p, new PairOfSameType<HRegionInfo>(hri_a, hri_b));
      regionStates.updateRegionState(hri_a, State.SPLITTING_NEW, sn);
      regionStates.updateRegionState(hri_b, State.SPLITTING_NEW, sn);
      regionStates.updateRegionState(rt, State.SPLITTING);

      // The below is for testing ONLY!  We can't do fault injection easily, so
      // resort to this kinda uglyness -- St.Ack 02/25/2011.
      if (TEST_SKIP_SPLIT_HANDLING) {
        LOG.warn("Skipping split message, TEST_SKIP_SPLIT_HANDLING is set");
        return true; // return true so that the splitting node stays
      }

      if (et == EventType.RS_ZK_REGION_SPLIT) {
        // Split done: parent offline as SPLIT, daughters online.
        regionOffline(p, State.SPLIT);
        regionOnline(hri_a, sn);
        regionOnline(hri_b, sn);
        splitRegions.remove(p);
      }
    }

    if (et == EventType.RS_ZK_REGION_SPLIT) {
      // split replicas
      doSplittingOfReplicas(rs_p.getRegion(), hri_a, hri_b);
      LOG.debug("Handling SPLIT event for " + encodedName + "; deleting node");
      // Remove region from ZK
      try {
        boolean successful = false;
        while (!successful) {
          // It's possible that the RS tickles in between the reading of the
          // znode and the deleting, so it's safe to retry.
          successful = ZKAssign.deleteNode(watcher, encodedName,
            EventType.RS_ZK_REGION_SPLIT, sn);
        }
      } catch (KeeperException e) {
        if (e instanceof NoNodeException) {
          // NOTE(review): logged path joins splitLogZNode, but ZKAssign
          // nodes live under the assignment znode — the message may print
          // the wrong path. Confirm intended znode.
          String znodePath = ZKUtil.joinZNode(watcher.splitLogZNode, encodedName);
          LOG.debug("The znode " + znodePath + " does not exist.  May be deleted already.");
        } else {
          server.abort("Error deleting SPLIT node " + encodedName, e);
        }
      }
      LOG.info("Handled SPLIT event; parent=" + p.getRegionNameAsString()
        + ", daughter a=" + hri_a.getRegionNameAsString() + ", daughter b="
        + hri_b.getRegionNameAsString() + ", on " + sn);

      // User could disable the table before master knows the new region.
      if (tableStateManager.isTableState(p.getTable(),
          ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
        unassign(hri_a);
        unassign(hri_b);
      }
    }
    return true;
  }
4492 
4493   private void doMergingOfReplicas(HRegionInfo mergedHri, final HRegionInfo hri_a,
4494       final HRegionInfo hri_b) {
4495     // Close replicas for the original unmerged regions. create/assign new replicas
4496     // for the merged parent.
4497     List<HRegionInfo> unmergedRegions = new ArrayList<HRegionInfo>();
4498     unmergedRegions.add(hri_a);
4499     unmergedRegions.add(hri_b);
4500     Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(unmergedRegions);
4501     Collection<List<HRegionInfo>> c = map.values();
4502     for (List<HRegionInfo> l : c) {
4503       for (HRegionInfo h : l) {
4504         if (!RegionReplicaUtil.isDefaultReplica(h)) {
4505           LOG.debug("Unassigning un-merged replica " + h);
4506           unassign(h);
4507         }
4508       }
4509     }
4510     int numReplicas = 1;
4511     try {
4512       numReplicas = server.getTableDescriptors().get(mergedHri.getTable()).
4513           getRegionReplication();
4514     } catch (IOException e) {
4515       LOG.warn("Couldn't get the replication attribute of the table " + mergedHri.getTable() +
4516           " due to " + e.getMessage() + ". The assignment of replicas for the merged region " +
4517           "will not be done");
4518     }
4519     List<HRegionInfo> regions = new ArrayList<HRegionInfo>();
4520     for (int i = 1; i < numReplicas; i++) {
4521       regions.add(RegionReplicaUtil.getRegionInfoForReplica(mergedHri, i));
4522     }
4523     try {
4524       assign(regions);
4525     } catch (IOException ioe) {
4526       LOG.warn("Couldn't assign all replica(s) of region " + mergedHri + " because of " +
4527                 ioe.getMessage());
4528     } catch (InterruptedException ie) {
4529       LOG.warn("Couldn't assign all replica(s) of region " + mergedHri+ " because of " +
4530                 ie.getMessage());
4531     }
4532     // Remove merged region's replica from AM's memory
4533     clearReplicaRegions(c);
4534   }
4535 
4536   private void doSplittingOfReplicas(final HRegionInfo parentHri, final HRegionInfo hri_a,
4537       final HRegionInfo hri_b) {
4538     // create new regions for the replica, and assign them to match with the
4539     // current replica assignments. If replica1 of parent is assigned to RS1,
4540     // the replica1s of daughters will be on the same machine
4541     int numReplicas = 1;
4542     try {
4543       numReplicas = server.getTableDescriptors().get(parentHri.getTable()).
4544           getRegionReplication();
4545     } catch (IOException e) {
4546       LOG.warn("Couldn't get the replication attribute of the table " + parentHri.getTable() +
4547           " due to " + e.getMessage() + ". The assignment of daughter replicas " +
4548           "replicas will not be done");
4549     }
4550     // unassign the old replicas
4551     List<HRegionInfo> parentRegion = new ArrayList<HRegionInfo>();
4552     parentRegion.add(parentHri);
4553     Map<ServerName, List<HRegionInfo>> currentAssign =
4554         regionStates.getRegionAssignments(parentRegion);
4555     Collection<List<HRegionInfo>> c = currentAssign.values();
4556     for (List<HRegionInfo> l : c) {
4557       for (HRegionInfo h : l) {
4558         if (!RegionReplicaUtil.isDefaultReplica(h)) {
4559           LOG.debug("Unassigning parent's replica " + h);
4560           unassign(h);
4561         }
4562       }
4563     }
4564     // assign daughter replicas
4565     Map<HRegionInfo, ServerName> map = new HashMap<HRegionInfo, ServerName>();
4566     for (int i = 1; i < numReplicas; i++) {
4567       prepareDaughterReplicaForAssignment(hri_a, parentHri, i, map);
4568       prepareDaughterReplicaForAssignment(hri_b, parentHri, i, map);
4569     }
4570     try {
4571       assign(map);
4572     } catch (IOException e) {
4573       LOG.warn("Caught exception " + e + " while trying to assign replica(s) of daughter(s)");
4574     } catch (InterruptedException e) {
4575       LOG.warn("Caught exception " + e + " while trying to assign replica(s) of daughter(s)");
4576     }
4577     // Remove parent region's replica from AM's memory
4578     clearReplicaRegions(c);
4579   }
4580 
4581   /*
4582    * Clear the replica regions after region split or merge.
4583    */
4584   private void clearReplicaRegions(Collection<List<HRegionInfo>> regionInfos) {
4585     for (List<HRegionInfo> regionInfoList : regionInfos) {
4586       for (HRegionInfo regionInfo : regionInfoList) {
4587         if (!RegionReplicaUtil.isDefaultReplica(regionInfo)) {
4588           regionStates.deleteRegion(regionInfo);
4589         }
4590       }
4591     }
4592   }
4593 
4594   private void prepareDaughterReplicaForAssignment(HRegionInfo daughterHri, HRegionInfo parentHri,
4595       int replicaId, Map<HRegionInfo, ServerName> map) {
4596     HRegionInfo parentReplica = RegionReplicaUtil.getRegionInfoForReplica(parentHri, replicaId);
4597     HRegionInfo daughterReplica = RegionReplicaUtil.getRegionInfoForReplica(daughterHri,
4598         replicaId);
4599     LOG.debug("Created replica region for daughter " + daughterReplica);
4600     ServerName sn;
4601     if ((sn = regionStates.getRegionServerOfRegion(parentReplica)) != null) {
4602       map.put(daughterReplica, sn);
4603     } else {
4604       List<ServerName> servers = serverManager.getOnlineServersList();
4605       sn = servers.get((new Random(System.currentTimeMillis())).nextInt(servers.size()));
4606       map.put(daughterReplica, sn);
4607     }
4608   }
4609 
  /** @return the set of replica regions that are pending close */
  public Set<HRegionInfo> getReplicasToClose() {
    return replicasToClose;
  }
4613 
4614   public Map<String, AtomicInteger> getFailedOpenTracker() {return failedOpenTracker;}
4615 
  /** Offlines a region into the given state without forcing (see 3-arg overload). */
  private void regionOffline(final HRegionInfo regionInfo, final State state) {
    regionOffline(regionInfo, state, false);
  }
4619 
4620   /**
4621    * A region is offline.  The new state should be the specified one,
4622    * if not null.  If the specified state is null, the new state is Offline.
4623    * The specified state can be Split/Merged/Offline/null only.
4624    *
4625    * If region offline is initiated by rpc call from admin, we force offline it.
4626    */
4627   private void regionOffline(final HRegionInfo regionInfo, final State state,
4628       final boolean force) {
4629     regionStates.regionOffline(regionInfo, state, force);
4630     removeClosedRegion(regionInfo);
4631     // remove the region plan as well just in case.
4632     clearRegionPlan(regionInfo);
4633     balancer.regionOffline(regionInfo);
4634 
4635     // Tell our listeners that a region was closed
4636     sendRegionClosedNotification(regionInfo);
4637     // also note that all the replicas of the primary should be closed
4638     if (force || (state != null && state.equals(State.SPLIT))) {
4639       Collection<HRegionInfo> c = new ArrayList<HRegionInfo>(1);
4640       c.add(regionInfo);
4641       Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(c);
4642       Collection<List<HRegionInfo>> allReplicas = map.values();
4643       for (List<HRegionInfo> list : allReplicas) {
4644         replicasToClose.addAll(list);
4645       }
4646     }
4647     else if (force || (state != null && state.equals(State.MERGED))) {
4648       Collection<HRegionInfo> c = new ArrayList<HRegionInfo>(1);
4649       c.add(regionInfo);
4650       Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(c);
4651       Collection<List<HRegionInfo>> allReplicas = map.values();
4652       for (List<HRegionInfo> list : allReplicas) {
4653         replicasToClose.addAll(list);
4654       }
4655     }
4656   }
4657 
4658   private void sendRegionOpenedNotification(final HRegionInfo regionInfo,
4659       final ServerName serverName) {
4660     if (!this.listeners.isEmpty()) {
4661       for (AssignmentListener listener : this.listeners) {
4662         listener.regionOpened(regionInfo, serverName);
4663       }
4664     }
4665   }
4666 
4667   private void sendRegionClosedNotification(final HRegionInfo regionInfo) {
4668     if (!this.listeners.isEmpty()) {
4669       for (AssignmentListener listener : this.listeners) {
4670         listener.regionClosed(regionInfo);
4671       }
4672     }
4673   }
4674 
  /**
   * Try to update some region states. If the state machine prevents
   * such update, an error message is returned to explain the reason.
   *
   * It's expected that in each transition there should have just one
   * region for opening/closing, 3 regions for splitting/merging.
   * These regions should be on the server that requested the change.
   *
   * Region state machine. Only these transitions
   * are expected to be triggered by a region server.
   *
   * On the state transition:
   *  (1) Open/Close should be initiated by master
   *      (a) Master sets the region to pending_open/pending_close
   *        in memory and hbase:meta after sending the request
   *        to the region server
   *      (b) Region server reports back to the master
   *        after open/close is done (either success/failure)
   *      (c) If region server has problem to report the status
   *        to master, it must be because the master is down or some
   *        temporary network issue. Otherwise, the region server should
   *        abort since it must be a bug. If the master is not accessible,
   *        the region server should keep trying until the server is
   *        stopped or till the status is reported to the (new) master
   *      (d) If region server dies in the middle of opening/closing
   *        a region, SSH picks it up and finishes it
   *      (e) If master dies in the middle, the new master recovers
   *        the state during initialization from hbase:meta. Region server
   *        can report any transition that has not been reported to
   *        the previous active master yet
   *  (2) Split/merge is initiated by region servers
   *      (a) To split a region, a region server sends a request
   *        to master to try to set a region to splitting, together with
   *        two daughters (to be created) to splitting new. If approved
   *        by the master, the splitting can then move ahead
   *      (b) To merge two regions, a region server sends a request to
   *        master to try to set the new merged region (to be created) to
   *        merging_new, together with two regions (to be merged) to merging.
   *        If it is ok with the master, the merge can then move ahead
   *      (c) Once the splitting/merging is done, the region server
   *        reports the status back to the master either success/failure.
   *      (d) Other scenarios should be handled similarly as for
   *        region open/close
   *
   * @return an error message explaining why the transition was rejected,
   *         or {@code null} if the transition was applied
   */
  protected String onRegionTransition(final ServerName serverName,
      final RegionStateTransition transition) {
    TransitionCode code = transition.getTransitionCode();
    HRegionInfo hri = HRegionInfo.convert(transition.getRegionInfo(0));
    RegionState current = regionStates.getRegionState(hri);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Got transition " + code + " for "
        + (current != null ? current.toString() : hri.getShortNameToLog())
        + " from " + serverName);
    }
    String errorMsg = null;
    switch (code) {
    case OPENED:
      // An already-open region on this server is a benign duplicate report.
      if (current != null && current.isOpened() && current.isOnServer(serverName)) {
        LOG.info("Region " + hri.getShortNameToLog() + " is already " + current.getState() + " on "
            + serverName);
        break;
      }
      // Intentional fall through: OPENED and FAILED_OPEN share the
      // pending-open validation below.
    case FAILED_OPEN:
      if (current == null
          || !current.isPendingOpenOrOpeningOnServer(serverName)) {
        errorMsg = hri.getShortNameToLog()
          + " is not pending open on " + serverName;
      } else if (code == TransitionCode.FAILED_OPEN) {
        onRegionFailedOpen(hri, serverName);
      } else {
        // OPENED path: validate the reported open sequence number.
        long openSeqNum = HConstants.NO_SEQNUM;
        if (transition.hasOpenSeqNum()) {
          openSeqNum = transition.getOpenSeqNum();
        }
        if (openSeqNum < 0) {
          errorMsg = "Newly opened region has invalid open seq num " + openSeqNum;
        } else {
          onRegionOpen(hri, serverName, openSeqNum);
        }
      }
      break;

    case CLOSED:
      if (current == null
          || !current.isPendingCloseOrClosingOnServer(serverName)) {
        errorMsg = hri.getShortNameToLog()
          + " is not pending close on " + serverName;
      } else {
        onRegionClosed(hri);
      }
      break;

    case READY_TO_SPLIT:
      // Check quotas/listener and the cluster-wide split switch before
      // letting the split proceed.
      try {
        regionStateListener.onRegionSplit(hri);
        if (!((HMaster)server).getSplitOrMergeTracker().isSplitOrMergeEnabled(
                Admin.MasterSwitchType.SPLIT)) {
          errorMsg = "split switch is off!";
          break;
        }
      } catch (IOException exp) {
        errorMsg = StringUtils.stringifyException(exp);
        break;
      }
      // Break out only for errors, otherwise fall through
    case SPLIT_PONR:
    case SPLIT:
      errorMsg =
      onRegionSplit(serverName, code, hri, HRegionInfo.convert(transition.getRegionInfo(1)),
        HRegionInfo.convert(transition.getRegionInfo(2)));
      break;

    case SPLIT_REVERTED:
      errorMsg =
          onRegionSplitReverted(serverName, hri,
            HRegionInfo.convert(transition.getRegionInfo(1)),
            HRegionInfo.convert(transition.getRegionInfo(2)));
      if (org.apache.commons.lang.StringUtils.isEmpty(errorMsg)) {
        // Rollback succeeded: let the listener (e.g. quota manager) know.
        try {
          regionStateListener.onRegionSplitReverted(hri);
        } catch (IOException exp) {
          LOG.warn(StringUtils.stringifyException(exp));
        }
      }
      break;
    case READY_TO_MERGE:
      // Check the cluster-wide merge switch before letting it proceed.
      if (!((HMaster)server).getSplitOrMergeTracker().isSplitOrMergeEnabled(
              Admin.MasterSwitchType.MERGE)) {
        errorMsg = "merge switch is off!";
        break;
      }
      // Break out only for errors, otherwise fall through
    case MERGE_PONR:
    case MERGED:
      errorMsg = onRegionMerge(serverName, code, hri,
        HRegionInfo.convert(transition.getRegionInfo(1)),
        HRegionInfo.convert(transition.getRegionInfo(2)));
      if (code == TransitionCode.MERGED && org.apache.commons.lang.StringUtils.isEmpty(errorMsg)) {
        // Merge succeeded: let the listener (e.g. quota manager) know.
        try {
          regionStateListener.onRegionMerged(hri);
        } catch (IOException exp) {
          errorMsg = StringUtils.stringifyException(exp);
        }
      }
      break;
    case MERGE_REVERTED:
        errorMsg = onRegionMergeReverted(serverName, code, hri,
                HRegionInfo.convert(transition.getRegionInfo(1)),
                HRegionInfo.convert(transition.getRegionInfo(2)));
      break;

    default:
      errorMsg = "Unexpected transition code " + code;
    }
    if (errorMsg != null) {
      LOG.error("Failed to transtion region from " + current + " to "
        + code + " by " + serverName + ": " + errorMsg);
    }
    return errorMsg;
  }
4835 
4836   private void processBogusAssignments(Map<ServerName, List<HRegionInfo>> bulkPlan) {
4837     if (bulkPlan.containsKey(LoadBalancer.BOGUS_SERVER_NAME)) {
4838       // Found no plan for some regions, put those regions in RIT
4839       for (HRegionInfo hri : bulkPlan.get(LoadBalancer.BOGUS_SERVER_NAME)) {
4840         regionStates.updateRegionState(hri, State.FAILED_OPEN);
4841       }
4842       bulkPlan.remove(LoadBalancer.BOGUS_SERVER_NAME);
4843     }
4844   }
4845 
4846   /**
4847    * @return Instance of load balancer
4848    */
4849   public LoadBalancer getBalancer() {
4850     return this.balancer;
4851   }
4852 
4853   public Map<ServerName, List<HRegionInfo>>
4854     getSnapShotOfAssignment(Collection<HRegionInfo> infos) {
4855     return getRegionStates().getRegionAssignments(infos);
4856   }
4857 
4858   void setRegionStateListener(RegionStateListener listener) {
4859     this.regionStateListener = listener;
4860   }
4861 
4862   private class DelayedAssignCallable implements Runnable {
4863 
4864     Callable<?> callable;
4865 
4866     public DelayedAssignCallable(Callable<?> callable) {
4867       this.callable = callable;
4868     }
4869 
4870     @Override
4871     public void run() {
4872       threadPoolExecutorService.submit(callable);
4873     }
4874   }
4875 
4876   /*
4877    * This is only used for unit-testing split failures.
4878    */
4879   public static void setTestSkipSplitHandling(boolean skipSplitHandling) {
4880     TEST_SKIP_SPLIT_HANDLING = skipSplitHandling;
4881   }
4882 
4883   /*
4884    * This is only used for unit-testing merge failures.
4885    */
4886   public static void setTestSkipMergeHandling(boolean skipMergeHandling) {
4887     TEST_SKIP_MERGE_HANDLING = skipMergeHandling;
4888   }
4889 
4890   /**
4891    * Scheduled task that will attempt to redeploy regions that have transitioned permanently into
4892    * FAILED_OPEN state.
4893    */
4894   class FailedOpenRetryRunnable implements Runnable {
4895     @Override
4896     public void run() {
4897       // Kick regions that have been transitioned into permanent FAILED_OPEN state
4898       for (RegionState s: getRegionStates().getAllRegions()) {
4899         if (s.isFailedOpen()) {
4900           LOG.info("Retrying failed assignment for " + s.toDescriptiveString());
4901           // Run the entire unassign protocol for safety's sake
4902           unassign(s.getRegion());
4903         }
4904       }
4905     }
4906   }
4907 
4908 }