View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.chaos.actions;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.LinkedList;
25  import java.util.List;
26  import java.util.Objects;
27  import java.util.Queue;
28  
29  import org.apache.commons.lang.math.RandomUtils;
30  import org.apache.hadoop.hbase.ServerName;
31  import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
32  import org.slf4j.Logger;
33  import org.slf4j.LoggerFactory;
34  
35  /**
36   * Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a
37   * server, or starts one, sleeping randomly (0-sleepTime) in between steps.
38   * The parameter maxDeadServers limits the maximum number of servers that
39   * can be down at the same time during rolling restarts.
40   */
41  public class RollingBatchRestartRsAction extends BatchRestartRsAction {
42    private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);
43    protected int maxDeadServers; // number of maximum dead servers at any given time. Defaults to 5
44  
45    public RollingBatchRestartRsAction(long sleepTime, float ratio) {
46      this(sleepTime, ratio, 5);
47    }
48  
49    public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
50      super(sleepTime, ratio);
51      this.maxDeadServers = maxDeadServers;
52    }
53  
54    enum KillOrStart {
55      KILL,
56      START
57    }
58  
59    @Override protected Logger getLogger() {
60      return LOG;
61    }
62  
63    @Override
64    public void perform() throws Exception {
65      getLogger().info(
66        String.format("Performing action: Rolling batch restarting %d%% of region servers",
67          (int)(ratio * 100)));
68      List<ServerName> selectedServers = selectServers();
69  
70      Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
71      LinkedList<ServerName> deadServers = new LinkedList<ServerName>();
72  
73      // loop while there are servers to be killed or dead servers to be restarted
74      while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty())  && !context.isStopping()) {
75        KillOrStart action = KillOrStart.KILL;
76  
77        if (serversToBeKilled.isEmpty()) { // no more servers to kill
78          action = KillOrStart.START;
79        } else if (deadServers.isEmpty()) {
80          action = KillOrStart.KILL; // no more servers to start
81        } else if (deadServers.size() >= maxDeadServers) {
82          // we have too many dead servers. Don't kill any more
83          action = KillOrStart.START;
84        } else {
85          // do a coin toss
86          action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
87        }
88  
89        ServerName server;
90  
91        switch (action) {
92          case KILL:
93            server = serversToBeKilled.remove();
94            try {
95              killRs(server);
96            } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
97              // We've seen this in test runs where we timeout but the kill went through. HBASE-9743
98              // So, add to deadServers even if exception so the start gets called.
99              getLogger().info("Problem killing but presume successful; code=" + e.getExitCode(), e);
100           }
101           deadServers.add(server);
102           break;
103         case START:
104           server = Objects.requireNonNull(deadServers.peek());
105           try {
106             startRs(server);
107             // only remove the server from the known dead list if `startRs` succeeds.
108             deadServers.remove(server);
109           } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
110             // The start may fail but better to just keep going though we may lose server.
111             // Shuffle the dead list to avoid getting stuck on a single stubborn host.
112             Collections.shuffle(deadServers);
113             getLogger().info(String.format(
114               "Problem starting %s, will retry; code=%s", server, e.getExitCode(), e));
115           }
116           break;
117       }
118 
119       sleep(RandomUtils.nextInt((int)sleepTime));
120     }
121   }
122 
123   protected List<ServerName> selectServers() throws IOException {
124     return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
125   }
126 
127   /**
128    * Small test to ensure the class basically works.
129    * @param args
130    * @throws Exception
131    */
132   public static void main(final String[] args) throws Exception {
133     RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
134       private int invocations = 0;
135       @Override
136       protected ServerName[] getCurrentServers() throws IOException {
137         final int count = 4;
138         List<ServerName> serverNames = new ArrayList<ServerName>(count);
139         for (int i = 0; i < 4; i++) {
140           serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
141         }
142         return serverNames.toArray(new ServerName[serverNames.size()]);
143       }
144 
145       @Override
146       protected void killRs(ServerName server) throws IOException {
147         getLogger().info("Killed " + server);
148         if (this.invocations++ % 3 == 0) {
149           throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
150         }
151       }
152 
153       @Override
154       protected void startRs(ServerName server) throws IOException {
155         getLogger().info("Started " + server);
156         if (this.invocations++ % 3 == 0) {
157           throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
158         }
159       }
160     };
161 
162     action.perform();
163   }
164 }