View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.chaos.actions;
20  
21  import java.util.ArrayList;
22  import java.util.HashSet;
23  import java.util.LinkedList;
24  import java.util.List;
25  import java.util.Set;
26  
27  import org.apache.commons.lang.math.RandomUtils;
28  import org.apache.hadoop.hbase.ClusterStatus;
29  import org.apache.hadoop.hbase.ServerName;
30  import org.junit.Assert;
31  import org.slf4j.Logger;
32  import org.slf4j.LoggerFactory;
33  
34  /** This action is too specific to put in ChaosMonkey; put it here */
35  public class UnbalanceKillAndRebalanceAction extends Action {
36    private static final Logger LOG =
37        LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class);
38    /** Fractions of servers to get regions and live and die respectively; from all other
39     * servers, HOARD_FRC_OF_REGIONS will be removed to the above randomly */
40    private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
41    private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
42    private static final double HOARD_FRC_OF_REGIONS = 0.8;
43    /** Waits between calling unbalance and killing servers, kills and rebalance, and rebalance
44     * and restarting the servers; to make sure these events have time to impact the cluster. */
45    private long waitForUnbalanceMilliSec;
46    private long waitForKillsMilliSec;
47    private long waitAfterBalanceMilliSec;
48    private boolean killMetaRs;
49  
50    public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance,
51        boolean killMetaRs) {
52      super();
53      waitForUnbalanceMilliSec = waitUnbalance;
54      waitForKillsMilliSec = waitKill;
55      waitAfterBalanceMilliSec = waitAfterBalance;
56      this.killMetaRs = killMetaRs;
57    }
58  
59    @Override protected Logger getLogger() {
60      return LOG;
61    }
62  
63    @Override
64    public void perform() throws Exception {
65      ClusterStatus status = this.cluster.getClusterStatus();
66      List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
67      Set<ServerName> killedServers = new HashSet<ServerName>();
68  
69      int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
70      int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
71      Assert.assertTrue(
72          "There are not enough victim servers: " + victimServers.size(),
73          liveCount + deadCount < victimServers.size());
74      List<ServerName> targetServers = new ArrayList<ServerName>(liveCount);
75      for (int i = 0; i < liveCount + deadCount; ++i) {
76        int victimIx = RandomUtils.nextInt(victimServers.size());
77        targetServers.add(victimServers.remove(victimIx));
78      }
79      unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
80      Thread.sleep(waitForUnbalanceMilliSec);
81      ServerName metaServer = cluster.getServerHoldingMeta();
82      for (ServerName targetServer: targetServers) {
83        // Don't keep killing servers if we're
84        // trying to stop the monkey.
85        if (context.isStopping()) {
86          break;
87        }
88        if (killedServers.size() >= liveCount) {
89          break;
90        }
91  
92        if (!killMetaRs && targetServer.equals(metaServer)) {
93          getLogger().info("Not killing server because it holds hbase:meta.");
94        } else {
95          killRs(targetServer);
96          killedServers.add(targetServer);
97        }
98      }
99  
100     Thread.sleep(waitForKillsMilliSec);
101     forceBalancer();
102     Thread.sleep(waitAfterBalanceMilliSec);
103     for (ServerName server:killedServers) {
104       startRs(server);
105     }
106   }
107 }