1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.chaos.actions;
20
21 import java.io.IOException;
22 import java.util.LinkedList;
23 import java.util.List;
24 import java.util.Queue;
25
26 import org.apache.commons.lang.math.RandomUtils;
27 import org.apache.hadoop.hbase.ServerName;
28 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
29 import org.apache.hadoop.hbase.util.Threads;
30 import org.apache.hadoop.util.Shell;
31 import org.slf4j.Logger;
32 import org.slf4j.LoggerFactory;
33
34
35
36
37
38
39
40 public class RollingBatchSuspendResumeRsAction extends Action {
41 private static final Logger LOG =
42 LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class);
43 private float ratio;
44 private long sleepTime;
45 private int maxSuspendedServers;
46
47 public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) {
48 this(sleepTime, ratio, 5);
49 }
50
51 public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) {
52 this.ratio = ratio;
53 this.sleepTime = sleepTime;
54 this.maxSuspendedServers = maxSuspendedServers;
55 }
56
57 enum SuspendOrResume {
58 SUSPEND, RESUME
59 }
60
61 @Override protected Logger getLogger() {
62 return LOG;
63 }
64
65 @Override
66 public void perform() throws Exception {
67 getLogger().info(
68 String.format("Performing action: Rolling batch restarting %d%% of region servers",
69 (int) (ratio * 100)));
70 List<ServerName> selectedServers = selectServers();
71
72 Queue<ServerName> serversToBeSuspended = new LinkedList<>(selectedServers);
73 Queue<ServerName> suspendedServers = new LinkedList<>();
74
75
76 while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context
77 .isStopping()) {
78 SuspendOrResume action;
79
80 if (serversToBeSuspended.isEmpty()) {
81 action = SuspendOrResume.RESUME;
82 } else if (suspendedServers.isEmpty()) {
83 action = SuspendOrResume.SUSPEND;
84 } else if (suspendedServers.size() >= maxSuspendedServers) {
85
86 action = SuspendOrResume.RESUME;
87 } else {
88
89 action = RandomUtils.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME;
90 }
91
92 ServerName server;
93 switch (action) {
94 case SUSPEND:
95 server = serversToBeSuspended.remove();
96 try {
97 suspendRs(server);
98 } catch (Shell.ExitCodeException e) {
99 getLogger().warn("Problem suspending but presume successful; code="
100 + e.getExitCode(), e);
101 }
102 suspendedServers.add(server);
103 break;
104 case RESUME:
105 server = suspendedServers.remove();
106 try {
107 resumeRs(server);
108 } catch (Shell.ExitCodeException e) {
109 getLogger().info("Problem resuming, will retry; code= " + e.getExitCode(), e);
110 }
111 break;
112 default:
113 throw new IllegalArgumentException(
114 "Encountered unexpected action type: " + action.name());
115 }
116
117 getLogger().info("Sleeping for: " + sleepTime);
118 Threads.sleep(sleepTime);
119 }
120 }
121
122 protected List<ServerName> selectServers() throws IOException {
123 return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
124 }
125
126 }