View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.procedure;
19  
20  import java.io.IOException;
21  import java.util.concurrent.Callable;
22  import java.util.concurrent.CountDownLatch;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.hbase.errorhandling.ForeignException;
27  import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher;
28  import org.apache.hadoop.hbase.errorhandling.ForeignExceptionListener;
29  import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
30  import org.apache.hadoop.hbase.errorhandling.TimeoutExceptionInjector;
31  import org.apache.zookeeper.KeeperException;
32  
33  /**
34   * Distributed procedure member's Subprocedure.  A procedure is sarted on a ProcedureCoordinator
35   * which communicates with ProcedureMembers who create and start its part of the Procedure.  This
36   * sub part is called a Subprocedure
37   *
38   * Users should subclass this and implement {@link #acquireBarrier()} (get local barrier for this
39   * member), {@link #insideBarrier()} (execute while globally barriered and release barrier) and
40   * {@link #cleanup(Exception)} (release state associated with subprocedure.)
41   *
42   * When submitted to a ProcedureMemeber, the call method is executed in a separate thread.
43   * Latches are use too block its progress and trigger continuations when barrier conditions are
44   * met.
45   *
46   * Exception that makes it out of calls to {@link #acquireBarrier()} or {@link #insideBarrier()}
47   * gets converted into {@link ForeignException}, which will get propagated to the
48   * {@link ProcedureCoordinator}.
49   *
50   * There is a category of procedure (ex: online-snapshots), and a user-specified instance-specific
51   * barrierName. (ex: snapshot121126).
52   */
53  abstract public class Subprocedure implements Callable<Void> {
54    private static final Log LOG = LogFactory.getLog(Subprocedure.class);
55  
56    // Name of the procedure
57    final private String barrierName;
58  
59    //
60    // Execution state
61    //
62  
63    /** wait on before allowing the in barrier phase to proceed */
64    private final CountDownLatch inGlobalBarrier;
65    /** counted down when the Subprocedure has completed */
66    private final CountDownLatch releasedLocalBarrier;
67  
68    //
69    // Error handling
70    //
71    /** monitor to check for errors */
72    protected final ForeignExceptionDispatcher monitor;
73    /** frequency to check for errors (ms) */
74    protected final long wakeFrequency;
75    protected final TimeoutExceptionInjector executionTimeoutTimer;
76    protected final ProcedureMemberRpcs rpcs;
77  
78    private volatile boolean complete = false;
79  
80    /**
81     * @param member reference to the member managing this subprocedure
82     * @param procName name of the procedure this subprocedure is associated with
83     * @param monitor notified if there is an error in the subprocedure
84     * @param wakeFrequency time in millis to wake to check if there is an error via the monitor (in
85     *          milliseconds).
86     * @param timeout time in millis that will trigger a subprocedure abort if it has not completed
87     */
88    public Subprocedure(ProcedureMember member, String procName, ForeignExceptionDispatcher monitor,
89        long wakeFrequency, long timeout) {
90      // Asserts should be caught during unit testing
91      assert member != null : "procedure member should be non-null";
92      assert member.getRpcs() != null : "rpc handlers should be non-null";
93      assert procName != null : "procedure name should be non-null";
94      assert monitor != null : "monitor should be non-null";
95  
96      // Default to a very large timeout
97      this.rpcs = member.getRpcs();
98      this.barrierName = procName;
99      this.monitor = monitor;
100     // forward any failures to coordinator.  Since this is a dispatcher, resend loops should not be
101     // possible.
102     this.monitor.addListener(new ForeignExceptionListener() {
103       @Override
104       public void receive(ForeignException ee) {
105         // if this is a notification from a remote source, just log
106         if (ee.isRemote()) {
107           LOG.debug("Was remote foreign exception, not redispatching error", ee);
108           return;
109         }
110         // if this is a local KeeperException, don't attempt to notify other members
111         if (ee.getCause() instanceof KeeperException) {
112           LOG.debug("Was KeeperException, not redispatching error", ee);
113           return;
114         }
115         // if it is other local error, then send it to the coordinator
116         try {
117           rpcs.sendMemberAborted(Subprocedure.this, ee);
118         } catch (IOException e) {
119           // this will fail all the running procedures, since the connection is down
120           LOG.error("Can't reach controller, not propagating error", e);
121         }
122       }
123     });
124 
125     this.wakeFrequency = wakeFrequency;
126     this.inGlobalBarrier = new CountDownLatch(1);
127     this.releasedLocalBarrier = new CountDownLatch(1);
128 
129     // accept error from timer thread, this needs to be started.
130     this.executionTimeoutTimer = new TimeoutExceptionInjector(monitor, timeout);
131   }
132 
133   public String getName() {
134      return barrierName;
135   }
136 
137   public String getMemberName() {
138     return rpcs.getMemberName();
139   }
140 
141   private void rethrowException() throws ForeignException {
142     monitor.rethrowException();
143   }
144 
145   /**
146    * Execute the Subprocedure {@link #acquireBarrier()} and {@link #insideBarrier()} methods
147    * while keeping some state for other threads to access.
148    *
149    * This would normally be executed by the ProcedureMemeber when a acquire message comes from the
150    * coordinator.  Rpcs are used to spend message back to the coordinator after different phases
151    * are executed.  Any exceptions caught during the execution (except for InterruptedException) get
152    * converted and propagated to coordinator via {@link ProcedureMemberRpcs#sendMemberAborted(
153    * Subprocedure, ForeignException)}.
154    */
155   @SuppressWarnings("finally")
156   @Override
157   final public Void call() {
158     LOG.debug("Starting subprocedure '" + barrierName + "' with timeout " +
159         executionTimeoutTimer.getMaxTime() + "ms");
160     // start the execution timeout timer
161     executionTimeoutTimer.start();
162 
163     try {
164       // start by checking for error first
165       rethrowException();
166       LOG.debug("Subprocedure '" + barrierName + "' starting 'acquire' stage");
167       acquireBarrier();
168       LOG.debug("Subprocedure '" + barrierName + "' locally acquired");
169       rethrowException();
170 
171       // vote yes to coordinator about being prepared
172       rpcs.sendMemberAcquired(this);
173       LOG.debug("Subprocedure '" + barrierName + "' coordinator notified of 'acquire', waiting on" +
174           " 'reached' or 'abort' from coordinator");
175 
176       // wait for the procedure to reach global barrier before proceding
177       waitForReachedGlobalBarrier();
178       rethrowException(); // if Coordinator aborts, will bail from here with exception
179 
180       // In traditional 2PC, if a member reaches this state the TX has been committed and the
181       // member is responsible for rolling forward and recovering and completing the subsequent
182       // operations in the case of failure.  It cannot rollback.
183       //
184       // This implementation is not 2PC since it can still rollback here, and thus has different
185       // semantics.
186 
187       LOG.debug("Subprocedure '" + barrierName + "' received 'reached' from coordinator.");
188       byte[] dataToCoordinator = insideBarrier();
189       LOG.debug("Subprocedure '" + barrierName + "' locally completed");
190       rethrowException();
191 
192       // Ack that the member has executed and released local barrier
193       rpcs.sendMemberCompleted(this, dataToCoordinator);
194       LOG.debug("Subprocedure '" + barrierName + "' has notified controller of completion");
195 
196       // make sure we didn't get an external exception
197       rethrowException();
198     } catch (Exception e) {
199       String msg = null;
200       if (e instanceof InterruptedException) {
201         msg = "Procedure '" + barrierName + "' aborting due to interrupt!" +
202             " Likely due to pool shutdown.";
203         Thread.currentThread().interrupt();
204       } else if (e instanceof ForeignException) {
205         msg = "Subprocedure '" + barrierName + "' aborting due to a ForeignException!";
206       } else {
207         msg = "Subprocedure '" + barrierName + "' failed!";
208       }
209       cancel(msg, e);
210 
211       LOG.debug("Subprocedure '" + barrierName + "' running cleanup.");
212       cleanup(e);
213     } finally {
214       releasedLocalBarrier.countDown();
215 
216       // tell the timer we are done, if we get here successfully
217       executionTimeoutTimer.complete();
218       complete = true;
219       LOG.debug("Subprocedure '" + barrierName + "' completed.");
220       return null;
221     }
222   }
223 
224   boolean isComplete() {
225     return complete;
226   }
227 
228   /**
229    * exposed for testing.
230    */
231   ForeignExceptionSnare getErrorCheckable() {
232     return this.monitor;
233   }
234 
235   /**
236    * The implementation of this method should gather and hold required resources (locks, disk
237    * space, etc) to satisfy the Procedures barrier condition.  For example, this would be where
238    * to make all the regions on a RS on the quiescent for an procedure that required all regions
239    * to be globally quiesed.
240    *
241    * Users should override this method.  If a quiescent is not required, this is overkill but
242    * can still be used to execute a procedure on all members and to propagate any exceptions.
243    *
244    * @throws ForeignException
245    */
246   abstract public void acquireBarrier() throws ForeignException;
247 
248   /**
249    * The implementation of this method should act with the assumption that the barrier condition
250    * has been satisfied.  Continuing the previous example, a condition could be that all RS's
251    * globally have been quiesced, and procedures that require this precondition could be
252    * implemented here.
253    * The implementation should also collect the result of the subprocedure as data to be returned
254    * to the coordinator upon successful completion.
255    * Users should override this method.
256    * @return the data the subprocedure wants to return to coordinator side.
257    * @throws ForeignException
258    */
259   abstract public byte[] insideBarrier() throws ForeignException;
260 
261   /**
262    * Users should override this method. This implementation of this method should rollback and
263    * cleanup any temporary or partially completed state that the {@link #acquireBarrier()} may have
264    * created.
265    * @param e
266    */
267   abstract public void cleanup(Exception e);
268 
269   /**
270    * Method to cancel the Subprocedure by injecting an exception from and external source.
271    * @param cause
272    */
273   public void cancel(String msg, Throwable cause) {
274     LOG.error(msg, cause);
275     complete = true;
276     if (cause instanceof ForeignException) {
277       monitor.receive((ForeignException) cause);
278     } else {
279       monitor.receive(new ForeignException(getMemberName(), cause));
280     }
281   }
282 
283   /**
284    * Callback for the member rpcs to call when the global barrier has been reached.  This
285    * unblocks the main subprocedure exectuion thread so that the Subprocedure's
286    * {@link #insideBarrier()} method can be run.
287    */
288   public void receiveReachedGlobalBarrier() {
289     inGlobalBarrier.countDown();
290   }
291 
292   //
293   // Subprocedure Internal State interface
294   //
295 
296   /**
297    * Wait for the reached global barrier notification.
298    *
299    * Package visibility for testing
300    *
301    * @throws ForeignException
302    * @throws InterruptedException
303    */
304   void waitForReachedGlobalBarrier() throws ForeignException, InterruptedException {
305     Procedure.waitForLatch(inGlobalBarrier, monitor, wakeFrequency,
306         barrierName + ":remote acquired");
307   }
308 
309   /**
310    * Waits until the entire procedure has globally completed, or has been aborted.
311    * @throws ForeignException
312    * @throws InterruptedException
313    */
314   public void waitForLocallyCompleted() throws ForeignException, InterruptedException {
315     Procedure.waitForLatch(releasedLocalBarrier, monitor, wakeFrequency,
316         barrierName + ":completed");
317   }
318 
319   /**
320    * Empty Subprocedure for testing.
321    *
322    * Must be public for stubbing used in testing to work.
323    */
324   public static class SubprocedureImpl extends Subprocedure {
325 
326     public SubprocedureImpl(ProcedureMember member, String opName,
327         ForeignExceptionDispatcher monitor, long wakeFrequency, long timeout) {
328       super(member, opName, monitor, wakeFrequency, timeout);
329     }
330 
331     @Override
332     public void acquireBarrier() throws ForeignException {}
333 
334     @Override
335     public byte[] insideBarrier() throws ForeignException {
336       return new byte[0];
337     }
338 
339     @Override
340     public void cleanup(Exception e) {}
341   };
342 }