View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.procedure;
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InterruptedIOException;
23  import java.io.OutputStream;
24  import java.util.ArrayList;
25  import java.util.Collection;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Set;
31  import java.util.concurrent.locks.Lock;
32  
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.apache.hadoop.hbase.HConstants;
36  import org.apache.hadoop.hbase.HRegionInfo;
37  import org.apache.hadoop.hbase.ServerName;
38  import org.apache.hadoop.hbase.client.ClusterConnection;
39  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
40  import org.apache.hadoop.hbase.master.AssignmentManager;
41  import org.apache.hadoop.hbase.master.MasterFileSystem;
42  import org.apache.hadoop.hbase.master.MasterServices;
43  import org.apache.hadoop.hbase.master.RegionState;
44  import org.apache.hadoop.hbase.master.RegionStates;
45  import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
46  import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
47  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
48  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionInfo;
49  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos;
50  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.ServerCrashState;
51  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
52  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
53  import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
54  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
55  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
56  import org.apache.hadoop.util.StringUtils;
57  import org.apache.zookeeper.KeeperException;
58  
59  /**
60   * Handle crashed server. This is a port to ProcedureV2 of what used to be euphemistically called
61   * ServerShutdownHandler.
62   *
63   * <p>The procedure flow varies dependent on whether meta is assigned, if we are
64   * doing distributed log replay versus distributed log splitting, and if we are to split logs at
65   * all.
66   *
 * <p>This procedure asks that all crashed servers get processed equally; we yield after the
 * completion of each successful flow step. We do this so that we do not 'deadlock' waiting on
 * a region assignment in order to replay edits, which could happen if a region moved and there
 * are edits on two servers for replay.
71   *
72   * <p>TODO: ASSIGN and WAIT_ON_ASSIGN (at least) are not idempotent. Revisit when assign is pv2.
73   * TODO: We do not have special handling for system tables.
74   */
75  public class ServerCrashProcedure
76  extends StateMachineProcedure<MasterProcedureEnv, ServerCrashState>
77  implements ServerProcedureInterface {
/** Logger shared by all instances of this procedure. */
private static final Log LOG = LogFactory.getLog(ServerCrashProcedure.class);

/**
 * Configuration key to set how long to wait in ms doing a quick check on meta state.
 */
public static final String KEY_SHORT_WAIT_ON_META =
    "hbase.master.servercrash.short.wait.on.meta.ms";

/** Default value, in ms, used when {@link #KEY_SHORT_WAIT_ON_META} is not configured. */
public static final int DEFAULT_SHORT_WAIT_ON_META = 1000;

/**
 * Configuration key to set how many retries to cycle before we give up on meta.
 * Each attempt will wait at least {@link #KEY_SHORT_WAIT_ON_META} milliseconds.
 */
public static final String KEY_RETRIES_ON_META =
    "hbase.master.servercrash.meta.retries";

/**
 * Presumably the default for {@link #KEY_RETRIES_ON_META}; its use is outside the code
 * visible here -- TODO confirm.
 */
public static final int DEFAULT_RETRIES_ON_META = 10;

/**
 * Configuration key to set how long to wait in ms on regions in transition.
 */
public static final String KEY_WAIT_ON_RIT =
    "hbase.master.servercrash.wait.on.rit.ms";

/** Default value, in ms, used when {@link #KEY_WAIT_ON_RIT} is not configured. */
public static final int DEFAULT_WAIT_ON_RIT = 30000;

/**
 * Set holding only {@link HRegionInfo#FIRST_META_REGIONINFO}. Handed to
 * prepareLogReplay when processing hbase:meta in distributed log replay mode.
 */
private static final Set<HRegionInfo> META_REGION_SET = new HashSet<HRegionInfo>();
static {
  META_REGION_SET.add(HRegionInfo.FIRST_META_REGIONINFO);
}
109 
/**
 * Name of the crashed server to process.
 */
private ServerName serverName;

/**
 * Whether DeadServer knows that we are processing it.
 * Not part of the serialized state, so it starts out false again after deserialization.
 */
private boolean notifiedDeadServer = false;

/**
 * Regions that were on the crashed server.
 * Stays null until the SERVER_CRASH_GET_REGIONS step runs or state is deserialized.
 */
private Set<HRegionInfo> regionsOnCrashedServer;

/**
 * Regions assigned. Usually some subset of {@link #regionsOnCrashedServer}.
 */
private List<HRegionInfo> regionsAssigned;

/** True when the log recovery mode read in start() is LOG_REPLAY (rather than log splitting). */
private boolean distributedLogReplay = false;
/** True if the crashed server was carrying the hbase:meta region; meta is then processed first. */
private boolean carryingMeta = false;
/** True if WALs of the crashed server are to be split (or replayed) as part of processing. */
private boolean shouldSplitWal;

/**
 * Cycles on same state. Good for figuring if we are stuck.
 */
private int cycles = 0;

/**
 * Number (as returned by getNumber()) of the state seen on the previous execution. So we can
 * tell if we are progressing or not. TODO: if useful, move this back up into
 * StateMachineProcedure
 */
private int previousState;
144 
145   /**
146    * Call this constructor queuing up a Procedure.
147    * @param serverName Name of the crashed server.
148    * @param shouldSplitWal True if we should split WALs as part of crashed server processing.
149    * @param carryingMeta True if carrying hbase:meta table region.
150    */
151   public ServerCrashProcedure(
152       final MasterProcedureEnv env,
153       final ServerName serverName,
154       final boolean shouldSplitWal,
155       final boolean carryingMeta) {
156     this.serverName = serverName;
157     this.shouldSplitWal = shouldSplitWal;
158     this.carryingMeta = carryingMeta;
159     this.setOwner(env.getRequestUser().getShortName());
160   }
161 
162   /**
163    * Used when deserializing from a procedure store; we'll construct one of these then call
164    * {@link #deserializeStateData(InputStream)}. Do not use directly.
165    */
166   public ServerCrashProcedure() {
167     super();
168   }
169 
170   private void throwProcedureYieldException(final String msg) throws ProcedureYieldException {
171     String logMsg = msg + "; cycle=" + this.cycles + ", running for " +
172         StringUtils.formatTimeDiff(System.currentTimeMillis(), getStartTime());
173     // The procedure executor logs ProcedureYieldException at trace level. For now, log these
174     // yields for server crash processing at DEBUG. Revisit when stable.
175     if (LOG.isDebugEnabled()) LOG.debug(logMsg);
176     throw new ProcedureYieldException(logMsg);
177   }
178 
179   @Override
180   protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
181       throws ProcedureYieldException {
182     if (LOG.isTraceEnabled()) {
183       LOG.trace(state);
184     }
185     // Keep running count of cycles
186     if (state.getNumber() != this.previousState) {
187       this.previousState = state.getNumber();
188       this.cycles = 0;
189     } else {
190       this.cycles++;
191     }
192     MasterServices services = env.getMasterServices();
193     // Is master fully online? If not, yield. No processing of servers unless master is up
194     if (!services.getAssignmentManager().isFailoverCleanupDone()) {
195       throwProcedureYieldException("Waiting on master failover to complete");
196     }
197     // HBASE-14802
198     // If we have not yet notified that we are processing a dead server, we should do now.
199     if (!notifiedDeadServer) {
200       services.getServerManager().getDeadServers().notifyServer(serverName);
201       notifiedDeadServer = true;
202     }
203 
204     try {
205       switch (state) {
206       case SERVER_CRASH_START:
207         LOG.info("Start processing crashed " + this.serverName);
208         start(env);
209         // If carrying meta, process it first. Else, get list of regions on crashed server.
210         if (this.carryingMeta) setNextState(ServerCrashState.SERVER_CRASH_PROCESS_META);
211         else setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
212         break;
213 
214       case SERVER_CRASH_GET_REGIONS:
215         // If hbase:meta is not assigned, yield.
216         if (!isMetaAssignedQuickTest(env)) {
217           // isMetaAssignedQuickTest does not really wait. Let's delay a little before
218           // another round of execution.
219           long wait =
220               env.getMasterConfiguration().getLong(KEY_SHORT_WAIT_ON_META,
221                 DEFAULT_SHORT_WAIT_ON_META);
222           wait = wait / 10;
223           Thread.sleep(wait);
224           throwProcedureYieldException("Waiting on hbase:meta assignment");
225         }
226         this.regionsOnCrashedServer =
227             services.getAssignmentManager().getRegionStates().getServerRegions(this.serverName);
228         // Where to go next? Depends on whether we should split logs at all or if we should do
229         // distributed log splitting (DLS) vs distributed log replay (DLR).
230         if (!this.shouldSplitWal) {
231           setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
232         } else if (this.distributedLogReplay) {
233           setNextState(ServerCrashState.SERVER_CRASH_PREPARE_LOG_REPLAY);
234         } else {
235           setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
236         }
237         break;
238 
239       case SERVER_CRASH_PROCESS_META:
240         // If we fail processing hbase:meta, yield.
241         if (!processMeta(env)) {
242           throwProcedureYieldException("Waiting on regions-in-transition to clear");
243         }
244         setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
245         break;
246 
247       case SERVER_CRASH_PREPARE_LOG_REPLAY:
248         prepareLogReplay(env, this.regionsOnCrashedServer);
249         setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
250         break;
251 
252       case SERVER_CRASH_SPLIT_LOGS:
253         splitLogs(env);
254         // If DLR, go to FINISH. Otherwise, if DLS, go to SERVER_CRASH_ASSIGN
255         if (this.distributedLogReplay) setNextState(ServerCrashState.SERVER_CRASH_FINISH);
256         else setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
257         break;
258 
259       case SERVER_CRASH_ASSIGN:
260         List<HRegionInfo> regionsToAssign = calcRegionsToAssign(env);
261 
262         // Assign may not be idempotent. SSH used to requeue the SSH if we got an IOE assigning
263         // which is what we are mimicing here but it looks prone to double assignment if assign
264         // fails midway. TODO: Test.
265 
266         // If no regions to assign, skip assign and skip to the finish.
267         boolean regions = regionsToAssign != null && !regionsToAssign.isEmpty();
268         if (regions) {
269           this.regionsAssigned = regionsToAssign;
270           if (!assign(env, regionsToAssign)) {
271             throwProcedureYieldException("Failed assign; will retry");
272           }
273         }
274         if (this.shouldSplitWal && distributedLogReplay) {
275           // Take this route even if there are apparently no regions assigned. This may be our
276           // second time through here; i.e. we assigned and crashed just about here. On second
277           // time through, there will be no regions because we assigned them in the previous step.
278           // Even though no regions, we need to go through here to clean up the DLR zk markers.
279           setNextState(ServerCrashState.SERVER_CRASH_WAIT_ON_ASSIGN);
280         } else {
281           setNextState(ServerCrashState.SERVER_CRASH_FINISH);
282         }
283         break;
284 
285       case SERVER_CRASH_WAIT_ON_ASSIGN:
286         // TODO: The list of regionsAssigned may be more than we actually assigned. See down in
287         // AM #1629 around 'if (regionStates.wasRegionOnDeadServer(encodedName)) {' where where we
288         // will skip assigning a region because it is/was on a dead server. Should never happen!
289         // It was on this server. Worst comes to worst, we'll still wait here till other server is
290         // processed.
291 
292         // If the wait on assign failed, yield -- if we have regions to assign.
293         if (this.regionsAssigned != null && !this.regionsAssigned.isEmpty()) {
294           if (!waitOnAssign(env, this.regionsAssigned)) {
295             throwProcedureYieldException("Waiting on region assign");
296           }
297         }
298         setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
299         break;
300 
301       case SERVER_CRASH_FINISH:
302         LOG.info("Finished processing of crashed " + serverName);
303         services.getServerManager().getDeadServers().finish(serverName);
304         return Flow.NO_MORE_STATE;
305 
306       default:
307         throw new UnsupportedOperationException("unhandled state=" + state);
308       }
309     } catch (ProcedureYieldException e) {
310       LOG.warn("Failed serverName=" + this.serverName + ", state=" + state + "; retry "
311           + e.getMessage());
312       throw e;
313     } catch (IOException e) {
314       LOG.warn("Failed serverName=" + this.serverName + ", state=" + state + "; retry", e);
315     } catch (InterruptedException e) {
316       // TODO: Make executor allow IEs coming up out of execute.
317       LOG.warn("Interrupted serverName=" + this.serverName + ", state=" + state + "; retry", e);
318       Thread.currentThread().interrupt();
319     }
320     return Flow.HAS_MORE_STATE;
321   }
322 
323   /**
324    * Start processing of crashed server. In here we'll just set configs. and return.
325    * @param env
326    * @throws IOException
327    */
328   private void start(final MasterProcedureEnv env) throws IOException {
329     MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
330     // Set recovery mode late. This is what the old ServerShutdownHandler used do.
331     mfs.setLogRecoveryMode();
332     this.distributedLogReplay = mfs.getLogRecoveryMode() == RecoveryMode.LOG_REPLAY;
333   }
334 
335   /**
336    * @param env
337    * @return False if we fail to assign and split logs on meta ('process').
338    * @throws IOException
339    * @throws InterruptedException
340    */
341   private boolean processMeta(final MasterProcedureEnv env)
342   throws IOException {
343     if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName);
344     MasterServices services = env.getMasterServices();
345     MasterFileSystem mfs = services.getMasterFileSystem();
346     AssignmentManager am = services.getAssignmentManager();
347     HRegionInfo metaHRI = HRegionInfo.FIRST_META_REGIONINFO;
348     if (this.shouldSplitWal) {
349       if (this.distributedLogReplay) {
350         prepareLogReplay(env, META_REGION_SET);
351       } else {
352         // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
353         mfs.splitMetaLog(serverName);
354         am.getRegionStates().logSplit(metaHRI);
355       }
356     }
357 
358     // Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout
359     boolean processed = true;
360     boolean shouldAssignMeta = false;
361     AssignmentManager.ServerHostRegion rsCarryingMetaRegion = am.isCarryingMeta(serverName);
362       switch (rsCarryingMetaRegion) {
363         case HOSTING_REGION:
364           LOG.info("Server " + serverName + " was carrying META. Trying to assign.");
365           am.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
366           shouldAssignMeta = true;
367           break;
368         case UNKNOWN:
369           if (!services.getMetaTableLocator().isLocationAvailable(services.getZooKeeper())) {
370             // the meta location as per master is null. This could happen in case when meta
371             // assignment in previous run failed, while meta znode has been updated to null.
372             // We should try to assign the meta again.
373             shouldAssignMeta = true;
374             break;
375           }
376           // fall through
377         case NOT_HOSTING_REGION:
378           LOG.info("META has been assigned to otherwhere, skip assigning.");
379           break;
380         default:
381           throw new IOException("Unsupported action in MetaServerShutdownHandler");
382     }
383     if (shouldAssignMeta) {
384       // TODO: May block here if hard time figuring state of meta.
385       verifyAndAssignMetaWithRetries(env);
386       if (this.shouldSplitWal && distributedLogReplay) {
387         int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT);
388         if (!waitOnRegionToClearRegionsInTransition(am, metaHRI, timeout)) {
389           processed = false;
390         } else {
391           // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
392           mfs.splitMetaLog(serverName);
393         }
394       }
395     }
396     return processed;
397   }
398 
399   /**
400    * @return True if region cleared RIT, else false if we timed out waiting.
401    * @throws InterruptedIOException
402    */
403   private boolean waitOnRegionToClearRegionsInTransition(AssignmentManager am,
404       final HRegionInfo hri, final int timeout)
405   throws InterruptedIOException {
406     try {
407       if (!am.waitOnRegionToClearRegionsInTransition(hri, timeout)) {
408         // Wait here is to avoid log replay hits current dead server and incur a RPC timeout
409         // when replay happens before region assignment completes.
410         LOG.warn("Region " + hri.getEncodedName() + " didn't complete assignment in time");
411         return false;
412       }
413     } catch (InterruptedException ie) {
414       throw new InterruptedIOException("Caught " + ie +
415         " during waitOnRegionToClearRegionsInTransition for " + hri);
416     }
417     return true;
418   }
419 
420   private void prepareLogReplay(final MasterProcedureEnv env, final Set<HRegionInfo> regions)
421   throws IOException {
422     if (LOG.isDebugEnabled()) {
423       LOG.debug("Mark " + size(this.regionsOnCrashedServer) + " regions-in-recovery from " +
424         this.serverName);
425     }
426     MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
427     AssignmentManager am = env.getMasterServices().getAssignmentManager();
428     mfs.prepareLogReplay(this.serverName, regions);
429     am.getRegionStates().logSplit(this.serverName);
430   }
431 
432   private void splitLogs(final MasterProcedureEnv env) throws IOException {
433     if (LOG.isDebugEnabled()) {
434       LOG.debug("Splitting logs from " + serverName + "; region count=" +
435         size(this.regionsOnCrashedServer));
436     }
437     MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
438     AssignmentManager am = env.getMasterServices().getAssignmentManager();
439     // TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running.
440     mfs.splitLog(this.serverName);
441     if (!carryingMeta) {
442       mfs.archiveMetaLog(this.serverName);
443     }
444     am.getRegionStates().logSplit(this.serverName);
445   }
446 
447   static int size(final Collection<HRegionInfo> hris) {
448     return hris == null? 0: hris.size();
449   }
450 
451   /**
452    * Figure out what we need to assign. Should be idempotent.
453    * @param env
454    * @return List of calculated regions to assign; may be empty or null.
455    * @throws IOException
456    */
457   private List<HRegionInfo> calcRegionsToAssign(final MasterProcedureEnv env)
458   throws IOException {
459     AssignmentManager am = env.getMasterServices().getAssignmentManager();
460     List<HRegionInfo> regionsToAssignAggregator = new ArrayList<HRegionInfo>();
461     int replicaCount = env.getMasterConfiguration().getInt(HConstants.META_REPLICAS_NUM,
462       HConstants.DEFAULT_META_REPLICA_NUM);
463     for (int i = 1; i < replicaCount; i++) {
464       HRegionInfo metaHri =
465           RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, i);
466       if (am.isCarryingMetaReplica(this.serverName, metaHri) ==
467           AssignmentManager.ServerHostRegion.HOSTING_REGION) {
468         if (LOG.isDebugEnabled()) {
469           LOG.debug("Reassigning meta replica" + metaHri + " that was on " + this.serverName);
470         }
471         regionsToAssignAggregator.add(metaHri);
472       }
473     }
474     // Clean out anything in regions in transition.
475     List<HRegionInfo> regionsInTransition = am.cleanOutCrashedServerReferences(serverName);
476     if (LOG.isDebugEnabled()) {
477       LOG.debug("Reassigning " + size(this.regionsOnCrashedServer) +
478         " region(s) that " + (serverName == null? "null": serverName)  +
479         " was carrying (and " + regionsInTransition.size() +
480         " regions(s) that were opening on this server)");
481     }
482     regionsToAssignAggregator.addAll(regionsInTransition);
483 
484     // Iterate regions that were on this server and figure which of these we need to reassign
485     if (this.regionsOnCrashedServer != null && !this.regionsOnCrashedServer.isEmpty()) {
486       RegionStates regionStates = am.getRegionStates();
487       for (HRegionInfo hri: this.regionsOnCrashedServer) {
488         if (regionsInTransition.contains(hri)) continue;
489         String encodedName = hri.getEncodedName();
490         Lock lock = am.acquireRegionLock(encodedName);
491         try {
492           RegionState rit = regionStates.getRegionTransitionState(hri);
493           if (processDeadRegion(hri, am)) {
494             ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
495             if (addressFromAM != null && !addressFromAM.equals(this.serverName)) {
496               // If this region is in transition on the dead server, it must be
497               // opening or pending_open, which should have been covered by
498               // AM#cleanOutCrashedServerReferences
499               LOG.info("Skip assigning " + hri.getRegionNameAsString()
500                 + " because opened on " + addressFromAM.getServerName());
501               continue;
502             }
503             if (rit != null) {
504               if (rit.getServerName() != null && !rit.isOnServer(this.serverName)) {
505                 // Skip regions that are in transition on other server
506                 LOG.info("Skip assigning region in transition on other server" + rit);
507                 continue;
508               }
509               LOG.info("Reassigning region " + rit + " and clearing zknode if exists");
510               try {
511                 // This clears out any RIT that might be sticking around.
512                 ZKAssign.deleteNodeFailSilent(env.getMasterServices().getZooKeeper(), hri);
513               } catch (KeeperException e) {
514                 // TODO: FIX!!!! ABORTING SERVER BECAUSE COULDN"T PURGE ZNODE. This is what we
515                 // used to do but that doesn't make it right!!!
516                 env.getMasterServices().abort("Unexpected error deleting RIT " + hri, e);
517                 throw new IOException(e);
518               }
519               regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
520             } else if (regionStates.isRegionInState(
521                 hri, RegionState.State.SPLITTING_NEW, RegionState.State.MERGING_NEW)) {
522               regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
523             }
524             regionsToAssignAggregator.add(hri);
525           // TODO: The below else if is different in branch-1 from master branch.
526           } else if (rit != null) {
527             if ((rit.isPendingCloseOrClosing() || rit.isOffline())
528                 && am.getTableStateManager().isTableState(hri.getTable(),
529                 ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) ||
530                 am.getReplicasToClose().contains(hri)) {
531               // If the table was partially disabled and the RS went down, we should clear the
532               // RIT and remove the node for the region.
533               // The rit that we use may be stale in case the table was in DISABLING state
534               // but though we did assign we will not be clearing the znode in CLOSING state.
535               // Doing this will have no harm. See HBASE-5927
536               regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
537               am.deleteClosingOrClosedNode(hri, rit.getServerName());
538               am.offlineDisabledRegion(hri);
539             } else {
540               LOG.warn("THIS SHOULD NOT HAPPEN: unexpected region in transition "
541                 + rit + " not to be assigned by SSH of server " + serverName);
542             }
543           }
544         } finally {
545           lock.unlock();
546         }
547       }
548     }
549     return regionsToAssignAggregator;
550   }
551 
552   private boolean assign(final MasterProcedureEnv env, final List<HRegionInfo> hris)
553   throws InterruptedIOException {
554     AssignmentManager am = env.getMasterServices().getAssignmentManager();
555     // If the dead server already restarted, assign to the same server to preserve locality
556     boolean retainAssignment =
557         env.getMasterServices().getServerManager().isServerWithSameHostnamePortOnline(serverName) ?
558             true : false;
559     try {
560       if (retainAssignment) {
561         Map<HRegionInfo, ServerName> hriServerMap =
562             new HashMap<HRegionInfo, ServerName>(hris.size());
563         for (HRegionInfo hri: hris) {
564           hriServerMap.put(hri, serverName);
565         }
566         LOG.info("Best effort in SSH to retain assignment of " + hris.size()
567           + " regions from the dead server " + serverName);
568         am.assign(hriServerMap);
569       } else {
570         LOG.info("Using round robin in SSH to assign " + hris.size()
571           + " regions from the dead server " + serverName);
572         am.assign(hris);
573       }
574     } catch (InterruptedException ie) {
575       LOG.error("Caught " + ie + " during " + (retainAssignment ? "retaining" : "round-robin")
576         + " assignment");
577       throw (InterruptedIOException)new InterruptedIOException().initCause(ie);
578     } catch (IOException ioe) {
579       LOG.warn("Caught " + ioe + " during region assignment, will retry");
580       return false;
581     }
582     return true;
583   }
584 
585   private boolean waitOnAssign(final MasterProcedureEnv env, final List<HRegionInfo> hris)
586   throws InterruptedIOException {
587     int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT);
588     for (HRegionInfo hri: hris) {
589       // TODO: Blocks here.
590       if (!waitOnRegionToClearRegionsInTransition(env.getMasterServices().getAssignmentManager(),
591           hri, timeout)) {
592         return false;
593       }
594     }
595     return true;
596   }
597 
598   @Override
599   protected void rollbackState(MasterProcedureEnv env, ServerCrashState state)
600   throws IOException {
601     // Can't rollback.
602     throw new UnsupportedOperationException("unhandled state=" + state);
603   }
604 
605   @Override
606   protected ServerCrashState getState(int stateId) {
607     return ServerCrashState.valueOf(stateId);
608   }
609 
610   @Override
611   protected int getStateId(ServerCrashState state) {
612     return state.getNumber();
613   }
614 
615   @Override
616   protected ServerCrashState getInitialState() {
617     return ServerCrashState.SERVER_CRASH_START;
618   }
619 
620   @Override
621   protected boolean abort(MasterProcedureEnv env) {
622     // TODO
623     return false;
624   }
625 
626   @Override
627   protected boolean acquireLock(final MasterProcedureEnv env) {
628     if (env.waitServerCrashProcessingEnabled(this)) return false;
629     return env.getProcedureQueue().tryAcquireServerExclusiveLock(this, getServerName());
630   }
631 
632   @Override
633   protected void releaseLock(final MasterProcedureEnv env) {
634     env.getProcedureQueue().releaseServerExclusiveLock(this, getServerName());
635   }
636 
637   @Override
638   public void toStringClassDetails(StringBuilder sb) {
639     sb.append(getClass().getSimpleName());
640     sb.append(" serverName=");
641     sb.append(this.serverName);
642     sb.append(", shouldSplitWal=");
643     sb.append(shouldSplitWal);
644     sb.append(", carryingMeta=");
645     sb.append(carryingMeta);
646   }
647 
648   @Override
649   public void serializeStateData(final OutputStream stream) throws IOException {
650     super.serializeStateData(stream);
651 
652     MasterProcedureProtos.ServerCrashStateData.Builder state =
653       MasterProcedureProtos.ServerCrashStateData.newBuilder().
654       setServerName(ProtobufUtil.toServerName(this.serverName)).
655       setDistributedLogReplay(this.distributedLogReplay).
656       setCarryingMeta(this.carryingMeta).
657       setShouldSplitWal(this.shouldSplitWal);
658     if (this.regionsOnCrashedServer != null && !this.regionsOnCrashedServer.isEmpty()) {
659       for (HRegionInfo hri: this.regionsOnCrashedServer) {
660         state.addRegionsOnCrashedServer(HRegionInfo.convert(hri));
661       }
662     }
663     if (this.regionsAssigned != null && !this.regionsAssigned.isEmpty()) {
664       for (HRegionInfo hri: this.regionsAssigned) {
665         state.addRegionsAssigned(HRegionInfo.convert(hri));
666       }
667     }
668     state.build().writeDelimitedTo(stream);
669   }
670 
671   @Override
672   public void deserializeStateData(final InputStream stream) throws IOException {
673     super.deserializeStateData(stream);
674 
675     MasterProcedureProtos.ServerCrashStateData state =
676       MasterProcedureProtos.ServerCrashStateData.parseDelimitedFrom(stream);
677     this.serverName = ProtobufUtil.toServerName(state.getServerName());
678     this.distributedLogReplay = state.hasDistributedLogReplay()?
679       state.getDistributedLogReplay(): false;
680     this.carryingMeta = state.hasCarryingMeta()? state.getCarryingMeta(): false;
681     // shouldSplitWAL has a default over in pb so this invocation will always work.
682     this.shouldSplitWal = state.getShouldSplitWal();
683     int size = state.getRegionsOnCrashedServerCount();
684     if (size > 0) {
685       this.regionsOnCrashedServer = new HashSet<HRegionInfo>(size);
686       for (RegionInfo ri: state.getRegionsOnCrashedServerList()) {
687         this.regionsOnCrashedServer.add(HRegionInfo.convert(ri));
688       }
689     }
690     size = state.getRegionsAssignedCount();
691     if (size > 0) {
692       this.regionsAssigned = new ArrayList<HRegionInfo>(size);
693       for (RegionInfo ri: state.getRegionsOnCrashedServerList()) {
694         this.regionsAssigned.add(HRegionInfo.convert(ri));
695       }
696     }
697   }
698 
699   /**
700    * Process a dead region from a dead RS. Checks if the region is disabled or
701    * disabling or if the region has a partially completed split.
702    * @param hri
703    * @param assignmentManager
704    * @return Returns true if specified region should be assigned, false if not.
705    * @throws IOException
706    */
707   private static boolean processDeadRegion(HRegionInfo hri, AssignmentManager assignmentManager)
708   throws IOException {
709     boolean tablePresent = assignmentManager.getTableStateManager().isTablePresent(hri.getTable());
710     if (!tablePresent) {
711       LOG.info("The table " + hri.getTable() + " was deleted.  Hence not proceeding.");
712       return false;
713     }
714     // If table is not disabled but the region is offlined,
715     boolean disabled = assignmentManager.getTableStateManager().isTableState(hri.getTable(),
716       ZooKeeperProtos.Table.State.DISABLED);
717     if (disabled){
718       LOG.info("The table " + hri.getTable() + " was disabled.  Hence not proceeding.");
719       return false;
720     }
721     if (hri.isOffline() && hri.isSplit()) {
722       // HBASE-7721: Split parent and daughters are inserted into hbase:meta as an atomic operation.
723       // If the meta scanner saw the parent split, then it should see the daughters as assigned
724       // to the dead server. We don't have to do anything.
725       return false;
726     }
727     boolean disabling = assignmentManager.getTableStateManager().isTableState(hri.getTable(),
728       ZooKeeperProtos.Table.State.DISABLING);
729     if (disabling) {
730       LOG.info("The table " + hri.getTable() + " is disabled.  Hence not assigning region" +
731         hri.getEncodedName());
732       return false;
733     }
734     return true;
735   }
736 
737   /**
738    * If hbase:meta is not assigned already, assign.
739    * @throws IOException
740    */
741   private void verifyAndAssignMetaWithRetries(final MasterProcedureEnv env) throws IOException {
742     MasterServices services = env.getMasterServices();
743     int iTimes = services.getConfiguration().getInt(KEY_RETRIES_ON_META, DEFAULT_RETRIES_ON_META);
744     // Just reuse same time as we have for short wait on meta. Adding another config is overkill.
745     long waitTime =
746       services.getConfiguration().getLong(KEY_SHORT_WAIT_ON_META, DEFAULT_SHORT_WAIT_ON_META);
747     int iFlag = 0;
748     while (true) {
749       try {
750         verifyAndAssignMeta(env);
751         break;
752       } catch (KeeperException e) {
753         services.abort("In server shutdown processing, assigning meta", e);
754         throw new IOException("Aborting", e);
755       } catch (Exception e) {
756         if (iFlag >= iTimes) {
757           services.abort("verifyAndAssignMeta failed after" + iTimes + " retries, aborting", e);
758           throw new IOException("Aborting", e);
759         }
760         try {
761           Thread.sleep(waitTime);
762         } catch (InterruptedException e1) {
763           LOG.warn("Interrupted when is the thread sleep", e1);
764           Thread.currentThread().interrupt();
765           throw (InterruptedIOException)new InterruptedIOException().initCause(e1);
766         }
767         iFlag++;
768       }
769     }
770   }
771 
772   /**
773    * If hbase:meta is not assigned already, assign.
774    * @throws InterruptedException
775    * @throws IOException
776    * @throws KeeperException
777    */
778   private void verifyAndAssignMeta(final MasterProcedureEnv env)
779       throws InterruptedException, IOException, KeeperException {
780     MasterServices services = env.getMasterServices();
781     if (!isMetaAssignedQuickTest(env)) {
782       services.getAssignmentManager().assignMeta(HRegionInfo.FIRST_META_REGIONINFO);
783     } else if (serverName.equals(services.getMetaTableLocator().
784         getMetaRegionLocation(services.getZooKeeper()))) {
785       // hbase:meta seems to be still alive on the server whom master is expiring
786       // and thinks is dying. Let's re-assign the hbase:meta anyway.
787       services.getAssignmentManager().assignMeta(HRegionInfo.FIRST_META_REGIONINFO);
788     } else {
789       LOG.info("Skip assigning hbase:meta because it is online at "
790           + services.getMetaTableLocator().getMetaRegionLocation(services.getZooKeeper()));
791     }
792   }
793 
794   /**
795    * A quick test that hbase:meta is assigned; blocks for short time only.
796    * @return True if hbase:meta location is available and verified as good.
797    * @throws InterruptedException
798    * @throws IOException
799    */
800   private boolean isMetaAssignedQuickTest(final MasterProcedureEnv env)
801   throws InterruptedException, IOException {
802     ZooKeeperWatcher zkw = env.getMasterServices().getZooKeeper();
803     MetaTableLocator mtl = env.getMasterServices().getMetaTableLocator();
804     boolean metaAssigned = false;
805     // Is hbase:meta location available yet?
806     if (mtl.isLocationAvailable(zkw)) {
807       ClusterConnection connection = env.getMasterServices().getConnection();
808       // Is hbase:meta location good yet?
809       long timeout =
810         env.getMasterConfiguration().getLong(KEY_SHORT_WAIT_ON_META, DEFAULT_SHORT_WAIT_ON_META);
811       if (mtl.verifyMetaRegionLocation(connection, zkw, timeout)) {
812         metaAssigned = true;
813       }
814     }
815     return metaAssigned;
816   }
817 
818   @Override
819   public ServerName getServerName() {
820     return this.serverName;
821   }
822 
823   @Override
824   public boolean hasMetaTableRegion() {
825     return this.carryingMeta;
826   }
827 
828   @Override
829   public ServerOperationType getServerOperationType() {
830     return ServerOperationType.CRASH_HANDLER;
831   }
832 
833   /**
834    * For this procedure, yield at end of each successful flow step so that all crashed servers
835    * can make progress rather than do the default which has each procedure running to completion
836    * before we move to the next. For crashed servers, especially if running with distributed log
837    * replay, we will want all servers to come along; we do not want the scenario where a server is
838    * stuck waiting for regions to online so it can replay edits.
839    */
840   @Override
841   protected boolean isYieldBeforeExecuteFromState(MasterProcedureEnv env, ServerCrashState state) {
842     return true;
843   }
844 
845   @Override
846   protected boolean shouldWaitClientAck(MasterProcedureEnv env) {
847     // The operation is triggered internally on the server
848     // the client does not know about this procedure.
849     return false;
850   }
851 }