1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18 package org.apache.hadoop.hbase.procedure;
19
20 import java.io.IOException;
21 import java.util.concurrent.Callable;
22 import java.util.concurrent.CountDownLatch;
23
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26 import org.apache.hadoop.hbase.errorhandling.ForeignException;
27 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher;
28 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionListener;
29 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
30 import org.apache.hadoop.hbase.errorhandling.TimeoutExceptionInjector;
31 import org.apache.zookeeper.KeeperException;
32
33 /**
 * Distributed procedure member's Subprocedure. A procedure is started on a ProcedureCoordinator
 * which communicates with ProcedureMembers who create and start its part of the Procedure. This
 * sub part is called a Subprocedure.
37 *
38 * Users should subclass this and implement {@link #acquireBarrier()} (get local barrier for this
39 * member), {@link #insideBarrier()} (execute while globally barriered and release barrier) and
40 * {@link #cleanup(Exception)} (release state associated with subprocedure.)
41 *
 * When submitted to a ProcedureMember, the call method is executed in a separate thread.
 * Latches are used to block its progress and trigger continuations when barrier conditions are
 * met.
45 *
46 * Exception that makes it out of calls to {@link #acquireBarrier()} or {@link #insideBarrier()}
47 * gets converted into {@link ForeignException}, which will get propagated to the
48 * {@link ProcedureCoordinator}.
49 *
50 * There is a category of procedure (ex: online-snapshots), and a user-specified instance-specific
51 * barrierName. (ex: snapshot121126).
52 */
53 abstract public class Subprocedure implements Callable<Void> {
54 private static final Log LOG = LogFactory.getLog(Subprocedure.class);
55
56 // Name of the procedure
57 final private String barrierName;
58
59 //
60 // Execution state
61 //
62
63 /** wait on before allowing the in barrier phase to proceed */
64 private final CountDownLatch inGlobalBarrier;
65 /** counted down when the Subprocedure has completed */
66 private final CountDownLatch releasedLocalBarrier;
67
68 //
69 // Error handling
70 //
71 /** monitor to check for errors */
72 protected final ForeignExceptionDispatcher monitor;
73 /** frequency to check for errors (ms) */
74 protected final long wakeFrequency;
75 protected final TimeoutExceptionInjector executionTimeoutTimer;
76 protected final ProcedureMemberRpcs rpcs;
77
78 private volatile boolean complete = false;
79
80 /**
81 * @param member reference to the member managing this subprocedure
82 * @param procName name of the procedure this subprocedure is associated with
83 * @param monitor notified if there is an error in the subprocedure
84 * @param wakeFrequency time in millis to wake to check if there is an error via the monitor (in
85 * milliseconds).
86 * @param timeout time in millis that will trigger a subprocedure abort if it has not completed
87 */
88 public Subprocedure(ProcedureMember member, String procName, ForeignExceptionDispatcher monitor,
89 long wakeFrequency, long timeout) {
90 // Asserts should be caught during unit testing
91 assert member != null : "procedure member should be non-null";
92 assert member.getRpcs() != null : "rpc handlers should be non-null";
93 assert procName != null : "procedure name should be non-null";
94 assert monitor != null : "monitor should be non-null";
95
96 // Default to a very large timeout
97 this.rpcs = member.getRpcs();
98 this.barrierName = procName;
99 this.monitor = monitor;
100 // forward any failures to coordinator. Since this is a dispatcher, resend loops should not be
101 // possible.
102 this.monitor.addListener(new ForeignExceptionListener() {
103 @Override
104 public void receive(ForeignException ee) {
105 // if this is a notification from a remote source, just log
106 if (ee.isRemote()) {
107 LOG.debug("Was remote foreign exception, not redispatching error", ee);
108 return;
109 }
110 // if this is a local KeeperException, don't attempt to notify other members
111 if (ee.getCause() instanceof KeeperException) {
112 LOG.debug("Was KeeperException, not redispatching error", ee);
113 return;
114 }
115 // if it is other local error, then send it to the coordinator
116 try {
117 rpcs.sendMemberAborted(Subprocedure.this, ee);
118 } catch (IOException e) {
119 // this will fail all the running procedures, since the connection is down
120 LOG.error("Can't reach controller, not propagating error", e);
121 }
122 }
123 });
124
125 this.wakeFrequency = wakeFrequency;
126 this.inGlobalBarrier = new CountDownLatch(1);
127 this.releasedLocalBarrier = new CountDownLatch(1);
128
129 // accept error from timer thread, this needs to be started.
130 this.executionTimeoutTimer = new TimeoutExceptionInjector(monitor, timeout);
131 }
132
133 public String getName() {
134 return barrierName;
135 }
136
137 public String getMemberName() {
138 return rpcs.getMemberName();
139 }
140
141 private void rethrowException() throws ForeignException {
142 monitor.rethrowException();
143 }
144
145 /**
146 * Execute the Subprocedure {@link #acquireBarrier()} and {@link #insideBarrier()} methods
147 * while keeping some state for other threads to access.
148 *
149 * This would normally be executed by the ProcedureMemeber when a acquire message comes from the
150 * coordinator. Rpcs are used to spend message back to the coordinator after different phases
151 * are executed. Any exceptions caught during the execution (except for InterruptedException) get
152 * converted and propagated to coordinator via {@link ProcedureMemberRpcs#sendMemberAborted(
153 * Subprocedure, ForeignException)}.
154 */
155 @SuppressWarnings("finally")
156 @Override
157 final public Void call() {
158 LOG.debug("Starting subprocedure '" + barrierName + "' with timeout " +
159 executionTimeoutTimer.getMaxTime() + "ms");
160 // start the execution timeout timer
161 executionTimeoutTimer.start();
162
163 try {
164 // start by checking for error first
165 rethrowException();
166 LOG.debug("Subprocedure '" + barrierName + "' starting 'acquire' stage");
167 acquireBarrier();
168 LOG.debug("Subprocedure '" + barrierName + "' locally acquired");
169 rethrowException();
170
171 // vote yes to coordinator about being prepared
172 rpcs.sendMemberAcquired(this);
173 LOG.debug("Subprocedure '" + barrierName + "' coordinator notified of 'acquire', waiting on" +
174 " 'reached' or 'abort' from coordinator");
175
176 // wait for the procedure to reach global barrier before proceding
177 waitForReachedGlobalBarrier();
178 rethrowException(); // if Coordinator aborts, will bail from here with exception
179
180 // In traditional 2PC, if a member reaches this state the TX has been committed and the
181 // member is responsible for rolling forward and recovering and completing the subsequent
182 // operations in the case of failure. It cannot rollback.
183 //
184 // This implementation is not 2PC since it can still rollback here, and thus has different
185 // semantics.
186
187 LOG.debug("Subprocedure '" + barrierName + "' received 'reached' from coordinator.");
188 byte[] dataToCoordinator = insideBarrier();
189 LOG.debug("Subprocedure '" + barrierName + "' locally completed");
190 rethrowException();
191
192 // Ack that the member has executed and released local barrier
193 rpcs.sendMemberCompleted(this, dataToCoordinator);
194 LOG.debug("Subprocedure '" + barrierName + "' has notified controller of completion");
195
196 // make sure we didn't get an external exception
197 rethrowException();
198 } catch (Exception e) {
199 String msg = null;
200 if (e instanceof InterruptedException) {
201 msg = "Procedure '" + barrierName + "' aborting due to interrupt!" +
202 " Likely due to pool shutdown.";
203 Thread.currentThread().interrupt();
204 } else if (e instanceof ForeignException) {
205 msg = "Subprocedure '" + barrierName + "' aborting due to a ForeignException!";
206 } else {
207 msg = "Subprocedure '" + barrierName + "' failed!";
208 }
209 cancel(msg, e);
210
211 LOG.debug("Subprocedure '" + barrierName + "' running cleanup.");
212 cleanup(e);
213 } finally {
214 releasedLocalBarrier.countDown();
215
216 // tell the timer we are done, if we get here successfully
217 executionTimeoutTimer.complete();
218 complete = true;
219 LOG.debug("Subprocedure '" + barrierName + "' completed.");
220 return null;
221 }
222 }
223
224 boolean isComplete() {
225 return complete;
226 }
227
228 /**
229 * exposed for testing.
230 */
231 ForeignExceptionSnare getErrorCheckable() {
232 return this.monitor;
233 }
234
235 /**
236 * The implementation of this method should gather and hold required resources (locks, disk
237 * space, etc) to satisfy the Procedures barrier condition. For example, this would be where
238 * to make all the regions on a RS on the quiescent for an procedure that required all regions
239 * to be globally quiesed.
240 *
241 * Users should override this method. If a quiescent is not required, this is overkill but
242 * can still be used to execute a procedure on all members and to propagate any exceptions.
243 *
244 * @throws ForeignException
245 */
246 abstract public void acquireBarrier() throws ForeignException;
247
248 /**
249 * The implementation of this method should act with the assumption that the barrier condition
250 * has been satisfied. Continuing the previous example, a condition could be that all RS's
251 * globally have been quiesced, and procedures that require this precondition could be
252 * implemented here.
253 * The implementation should also collect the result of the subprocedure as data to be returned
254 * to the coordinator upon successful completion.
255 * Users should override this method.
256 * @return the data the subprocedure wants to return to coordinator side.
257 * @throws ForeignException
258 */
259 abstract public byte[] insideBarrier() throws ForeignException;
260
261 /**
262 * Users should override this method. This implementation of this method should rollback and
263 * cleanup any temporary or partially completed state that the {@link #acquireBarrier()} may have
264 * created.
265 * @param e
266 */
267 abstract public void cleanup(Exception e);
268
269 /**
270 * Method to cancel the Subprocedure by injecting an exception from and external source.
271 * @param cause
272 */
273 public void cancel(String msg, Throwable cause) {
274 LOG.error(msg, cause);
275 complete = true;
276 if (cause instanceof ForeignException) {
277 monitor.receive((ForeignException) cause);
278 } else {
279 monitor.receive(new ForeignException(getMemberName(), cause));
280 }
281 }
282
283 /**
284 * Callback for the member rpcs to call when the global barrier has been reached. This
285 * unblocks the main subprocedure exectuion thread so that the Subprocedure's
286 * {@link #insideBarrier()} method can be run.
287 */
288 public void receiveReachedGlobalBarrier() {
289 inGlobalBarrier.countDown();
290 }
291
292 //
293 // Subprocedure Internal State interface
294 //
295
296 /**
297 * Wait for the reached global barrier notification.
298 *
299 * Package visibility for testing
300 *
301 * @throws ForeignException
302 * @throws InterruptedException
303 */
304 void waitForReachedGlobalBarrier() throws ForeignException, InterruptedException {
305 Procedure.waitForLatch(inGlobalBarrier, monitor, wakeFrequency,
306 barrierName + ":remote acquired");
307 }
308
309 /**
310 * Waits until the entire procedure has globally completed, or has been aborted.
311 * @throws ForeignException
312 * @throws InterruptedException
313 */
314 public void waitForLocallyCompleted() throws ForeignException, InterruptedException {
315 Procedure.waitForLatch(releasedLocalBarrier, monitor, wakeFrequency,
316 barrierName + ":completed");
317 }
318
319 /**
320 * Empty Subprocedure for testing.
321 *
322 * Must be public for stubbing used in testing to work.
323 */
324 public static class SubprocedureImpl extends Subprocedure {
325
326 public SubprocedureImpl(ProcedureMember member, String opName,
327 ForeignExceptionDispatcher monitor, long wakeFrequency, long timeout) {
328 super(member, opName, monitor, wakeFrequency, timeout);
329 }
330
331 @Override
332 public void acquireBarrier() throws ForeignException {}
333
334 @Override
335 public byte[] insideBarrier() throws ForeignException {
336 return new byte[0];
337 }
338
339 @Override
340 public void cleanup(Exception e) {}
341 };
342 }