View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.chaos.actions;
20  
21  import java.io.IOException;
22  import java.util.LinkedList;
23  import java.util.List;
24  import java.util.Queue;
25  
26  import org.apache.commons.lang.math.RandomUtils;
27  import org.apache.hadoop.hbase.ServerName;
28  import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
29  import org.apache.hadoop.hbase.util.Threads;
30  import org.apache.hadoop.util.Shell;
31  import org.slf4j.Logger;
32  import org.slf4j.LoggerFactory;
33  
34  /**
35   * Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either
36   * suspend a server, or resume one, sleeping (sleepTime) in between steps. The parameter
37   * maxSuspendedServers limits the maximum number of servers that can be down at the same time
38   * during rolling restarts.
39   */
40  public class RollingBatchSuspendResumeRsAction extends Action {
41    private static final Logger LOG =
42        LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class);
43    private float ratio;
44    private long sleepTime;
45    private int maxSuspendedServers; // number of maximum suspended servers at any given time.
46  
47    public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) {
48      this(sleepTime, ratio, 5);
49    }
50  
51    public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) {
52      this.ratio = ratio;
53      this.sleepTime = sleepTime;
54      this.maxSuspendedServers = maxSuspendedServers;
55    }
56  
57    enum SuspendOrResume {
58      SUSPEND, RESUME
59    }
60  
61    @Override protected Logger getLogger() {
62      return LOG;
63    }
64  
65    @Override
66    public void perform() throws Exception {
67      getLogger().info(
68        String.format("Performing action: Rolling batch restarting %d%% of region servers",
69          (int) (ratio * 100)));
70      List<ServerName> selectedServers = selectServers();
71  
72      Queue<ServerName> serversToBeSuspended = new LinkedList<>(selectedServers);
73      Queue<ServerName> suspendedServers = new LinkedList<>();
74  
75      // loop while there are servers to be suspended or suspended servers to be resumed
76      while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context
77          .isStopping()) {
78        SuspendOrResume action;
79  
80        if (serversToBeSuspended.isEmpty()) { // no more servers to suspend
81          action = SuspendOrResume.RESUME;
82        } else if (suspendedServers.isEmpty()) {
83          action = SuspendOrResume.SUSPEND; // no more servers to resume
84        } else if (suspendedServers.size() >= maxSuspendedServers) {
85          // we have too many suspended servers. Don't suspend any more
86          action = SuspendOrResume.RESUME;
87        } else {
88          // do a coin toss
89          action = RandomUtils.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME;
90        }
91  
92        ServerName server;
93        switch (action) {
94          case SUSPEND:
95            server = serversToBeSuspended.remove();
96            try {
97              suspendRs(server);
98            } catch (Shell.ExitCodeException e) {
99              getLogger().warn("Problem suspending but presume successful; code="
100               + e.getExitCode(), e);
101           }
102           suspendedServers.add(server);
103           break;
104         case RESUME:
105           server = suspendedServers.remove();
106           try {
107             resumeRs(server);
108           } catch (Shell.ExitCodeException e) {
109             getLogger().info("Problem resuming, will retry; code= " + e.getExitCode(), e);
110           }
111           break;
112         default:
113           throw new IllegalArgumentException(
114               "Encountered unexpected action type: " + action.name());
115       }
116 
117       getLogger().info("Sleeping for: " + sleepTime);
118       Threads.sleep(sleepTime);
119     }
120   }
121 
122   protected List<ServerName> selectServers() throws IOException {
123     return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
124   }
125 
126 }