1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.chaos.actions;
20
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.LinkedList;
25 import java.util.List;
26 import java.util.Objects;
27 import java.util.Queue;
28
29 import org.apache.commons.lang.math.RandomUtils;
30 import org.apache.hadoop.hbase.ServerName;
31 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
32 import org.slf4j.Logger;
33 import org.slf4j.LoggerFactory;
34
35
36
37
38
39
40
41 public class RollingBatchRestartRsAction extends BatchRestartRsAction {
42 private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);
43 protected int maxDeadServers;
44
45 public RollingBatchRestartRsAction(long sleepTime, float ratio) {
46 this(sleepTime, ratio, 5);
47 }
48
49 public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
50 super(sleepTime, ratio);
51 this.maxDeadServers = maxDeadServers;
52 }
53
54 enum KillOrStart {
55 KILL,
56 START
57 }
58
59 @Override protected Logger getLogger() {
60 return LOG;
61 }
62
63 @Override
64 public void perform() throws Exception {
65 getLogger().info(
66 String.format("Performing action: Rolling batch restarting %d%% of region servers",
67 (int)(ratio * 100)));
68 List<ServerName> selectedServers = selectServers();
69
70 Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
71 LinkedList<ServerName> deadServers = new LinkedList<ServerName>();
72
73
74 while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
75 KillOrStart action = KillOrStart.KILL;
76
77 if (serversToBeKilled.isEmpty()) {
78 action = KillOrStart.START;
79 } else if (deadServers.isEmpty()) {
80 action = KillOrStart.KILL;
81 } else if (deadServers.size() >= maxDeadServers) {
82
83 action = KillOrStart.START;
84 } else {
85
86 action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
87 }
88
89 ServerName server;
90
91 switch (action) {
92 case KILL:
93 server = serversToBeKilled.remove();
94 try {
95 killRs(server);
96 } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
97
98
99 getLogger().info("Problem killing but presume successful; code=" + e.getExitCode(), e);
100 }
101 deadServers.add(server);
102 break;
103 case START:
104 server = Objects.requireNonNull(deadServers.peek());
105 try {
106 startRs(server);
107
108 deadServers.remove(server);
109 } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
110
111
112 Collections.shuffle(deadServers);
113 getLogger().info(String.format(
114 "Problem starting %s, will retry; code=%s", server, e.getExitCode(), e));
115 }
116 break;
117 }
118
119 sleep(RandomUtils.nextInt((int)sleepTime));
120 }
121 }
122
123 protected List<ServerName> selectServers() throws IOException {
124 return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
125 }
126
127
128
129
130
131
132 public static void main(final String[] args) throws Exception {
133 RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
134 private int invocations = 0;
135 @Override
136 protected ServerName[] getCurrentServers() throws IOException {
137 final int count = 4;
138 List<ServerName> serverNames = new ArrayList<ServerName>(count);
139 for (int i = 0; i < 4; i++) {
140 serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
141 }
142 return serverNames.toArray(new ServerName[serverNames.size()]);
143 }
144
145 @Override
146 protected void killRs(ServerName server) throws IOException {
147 getLogger().info("Killed " + server);
148 if (this.invocations++ % 3 == 0) {
149 throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
150 }
151 }
152
153 @Override
154 protected void startRs(ServerName server) throws IOException {
155 getLogger().info("Started " + server);
156 if (this.invocations++ % 3 == 0) {
157 throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
158 }
159 }
160 };
161
162 action.perform();
163 }
164 }