View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional information regarding
4    * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
5    * "License"); you may not use this file except in compliance with the License. You may obtain a
6    * copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
7    * law or agreed to in writing, software distributed under the License is distributed on an "AS IS"
8    * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License
9    * for the specific language governing permissions and limitations under the License.
10   */
11  
12  package org.apache.hadoop.hbase.coordination;
13  
14  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLIT;
15  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLITTING;
16  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REQUEST_REGION_SPLIT;
17  
18  import java.io.IOException;
19  import java.util.List;
20  
21  import org.apache.commons.logging.Log;
22  import org.apache.commons.logging.LogFactory;
23  import org.apache.hadoop.hbase.CoordinatedStateManager;
24  import org.apache.hadoop.hbase.HRegionInfo;
25  import org.apache.hadoop.hbase.RegionTransition;
26  import org.apache.hadoop.hbase.ServerName;
27  import org.apache.hadoop.hbase.coordination.SplitTransactionCoordination;
28  import org.apache.hadoop.hbase.executor.EventType;
29  import org.apache.hadoop.hbase.regionserver.HRegion;
30  import org.apache.hadoop.hbase.regionserver.Region;
31  import org.apache.hadoop.hbase.regionserver.RegionServerServices;
32  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
33  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
34  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
35  import org.apache.zookeeper.KeeperException;
36  import org.apache.zookeeper.data.Stat;
37  
38  public class ZKSplitTransactionCoordination implements SplitTransactionCoordination {
39  
40    private CoordinatedStateManager coordinationManager;
41    private final ZooKeeperWatcher watcher;
42  
43    // max wait for split transaction - 100 times in a loop with 100 ms of thread sleep each time
44    // this accounts for ~24 s due to calls involved in loop. even for busy cluster, by this time,
45    // we should have been able to complete setData() In fact, ideally, 2nd retry after failed
46    // attempt should be sufficient to retrieve correct ZK node version and successfully updating
47    // RIT info in ZK node.
48    private static final int SPIN_WAIT_TIMEOUT = 100;
49  
50    private static final Log LOG = LogFactory.getLog(ZKSplitTransactionCoordination.class);
51  
52    public ZKSplitTransactionCoordination(CoordinatedStateManager coordinationProvider,
53        ZooKeeperWatcher watcher) {
54      this.coordinationManager = coordinationProvider;
55      this.watcher = watcher;
56    }
57  
58    /**
59     * Creates a new ephemeral node in the PENDING_SPLIT state for the specified region. Create it
60     * ephemeral in case regionserver dies mid-split.
61     * <p>
62     * Does not transition nodes from other states. If a node already exists for this region, an
63     * Exception will be thrown.
64     * @param parent region to be created as offline
65     * @param serverName server event originates from
66     * @param hri_a daughter region
67     * @param hri_b daughter region
68     * @throws IOException
69     */
70  
71    @Override
72    public void startSplitTransaction(HRegion parent, ServerName serverName, HRegionInfo hri_a,
73        HRegionInfo hri_b) throws IOException {
74  
75      HRegionInfo region = parent.getRegionInfo();
76      try {
77  
78        LOG.debug(watcher.prefix("Creating ephemeral node for " + region.getEncodedName()
79            + " in PENDING_SPLIT state"));
80        byte[] payload = HRegionInfo.toDelimitedByteArray(hri_a, hri_b);
81        RegionTransition rt =
82            RegionTransition.createRegionTransition(RS_ZK_REQUEST_REGION_SPLIT,
83              region.getRegionName(), serverName, payload);
84        String node = ZKAssign.getNodeName(watcher, region.getEncodedName());
85        if (!ZKUtil.createEphemeralNodeAndWatch(watcher, node, rt.toByteArray())) {
86          throw new IOException("Failed create of ephemeral " + node);
87        }
88  
89      } catch (KeeperException e) {
90        throw new IOException("Failed creating PENDING_SPLIT znode on "
91            + parent.getRegionInfo().getRegionNameAsString(), e);
92      }
93  
94    }
95  
96    /**
97     * Transitions an existing ephemeral node for the specified region which is currently in the begin
98     * state to be in the end state. Master cleans up the final SPLIT znode when it reads it (or if we
99     * crash, zk will clean it up).
100    * <p>
101    * Does not transition nodes from other states. If for some reason the node could not be
102    * transitioned, the method returns -1. If the transition is successful, the version of the node
103    * after transition is returned.
104    * <p>
105    * This method can fail and return false for three different reasons:
106    * <ul>
107    * <li>Node for this region does not exist</li>
108    * <li>Node for this region is not in the begin state</li>
109    * <li>After verifying the begin state, update fails because of wrong version (this should never
110    * actually happen since an RS only does this transition following a transition to the begin
111    * state. If two RS are conflicting, one would fail the original transition to the begin state and
112    * not this transition)</li>
113    * </ul>
114    * <p>
115    * Does not set any watches.
116    * <p>
117    * This method should only be used by a RegionServer when splitting a region.
118    * @param parent region to be transitioned to opened
119    * @param a Daughter a of split
120    * @param b Daughter b of split
121    * @param serverName server event originates from
122    * @param std split transaction details
123    * @param beginState the expected current state the znode should be
124    * @param endState the state to be transition to
125    * @return version of node after transition, -1 if unsuccessful transition
126    * @throws IOException
127    */
128 
129   private int transitionSplittingNode(HRegionInfo parent, HRegionInfo a, HRegionInfo b,
130       ServerName serverName, SplitTransactionDetails std, final EventType beginState,
131       final EventType endState) throws IOException {
132     ZkSplitTransactionDetails zstd = (ZkSplitTransactionDetails) std;
133     byte[] payload = HRegionInfo.toDelimitedByteArray(a, b);
134     try {
135       return ZKAssign.transitionNode(watcher, parent, serverName, beginState, endState,
136         zstd.getZnodeVersion(), payload);
137     } catch (KeeperException e) {
138       throw new IOException(
139           "Failed transition of splitting node " + parent.getRegionNameAsString(), e);
140     }
141   }
142 
143   /**
144    * Wait for the splitting node to be transitioned from pending_split to splitting by master.
145    * That's how we are sure master has processed the event and is good with us to move on. If we
146    * don't get any update, we periodically transition the node so that master gets the callback. If
147    * the node is removed or is not in pending_split state any more, we abort the split.
148    */
149   @Override
150   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="REC_CATCH_EXCEPTION",
151     justification="Intended")
152   public void waitForSplitTransaction(final RegionServerServices services, Region parent,
153       HRegionInfo hri_a, HRegionInfo hri_b, SplitTransactionDetails sptd) throws IOException {
154     ZkSplitTransactionDetails zstd = (ZkSplitTransactionDetails) sptd;
155 
156     // After creating the split node, wait for master to transition it
157     // from PENDING_SPLIT to SPLITTING so that we can move on. We want master
158     // knows about it and won't transition any region which is splitting.
159     try {
160       int spins = 0;
161       Stat stat = new Stat();
162       ServerName expectedServer = coordinationManager.getServer().getServerName();
163       String node = parent.getRegionInfo().getEncodedName();
164       while (!(coordinationManager.getServer().isStopped() || services.isStopping())) {
165         if (spins % 5 == 0) {
166           LOG.debug("Still waiting for master to process " + "the pending_split for " + node);
167           SplitTransactionDetails temp = getDefaultDetails();
168           transitionSplittingNode(parent.getRegionInfo(), hri_a, hri_b, expectedServer, temp,
169             RS_ZK_REQUEST_REGION_SPLIT, RS_ZK_REQUEST_REGION_SPLIT);
170         }
171         Thread.sleep(100);
172         spins++;
173         if (spins > SPIN_WAIT_TIMEOUT) {
174           throw new IOException("Waiting time for Split Transaction exceeded for region: "
175             + parent.getRegionInfo().getRegionNameAsString());
176         }
177         byte[] data = ZKAssign.getDataNoWatch(watcher, node, stat);
178         if (data == null) {
179           throw new IOException("Data is null, splitting node " + node + " no longer exists");
180         }
181         RegionTransition rt = RegionTransition.parseFrom(data);
182         EventType et = rt.getEventType();
183         if (et == RS_ZK_REGION_SPLITTING) {
184           ServerName serverName = rt.getServerName();
185           if (!serverName.equals(expectedServer)) {
186             throw new IOException("Splitting node " + node + " is for " + serverName + ", not us "
187                 + expectedServer);
188           }
189           byte[] payloadOfSplitting = rt.getPayload();
190           List<HRegionInfo> splittingRegions =
191               HRegionInfo.parseDelimitedFrom(payloadOfSplitting, 0, payloadOfSplitting.length);
192           assert splittingRegions.size() == 2;
193           HRegionInfo a = splittingRegions.get(0);
194           HRegionInfo b = splittingRegions.get(1);
195           if (!(hri_a.equals(a) && hri_b.equals(b))) {
196             throw new IOException("Splitting node " + node + " is for " + a + ", " + b
197                 + ", not expected daughters: " + hri_a + ", " + hri_b);
198           }
199           // Master has processed it.
200           zstd.setZnodeVersion(stat.getVersion());
201           return;
202         }
203         if (et != RS_ZK_REQUEST_REGION_SPLIT) {
204           throw new IOException("Splitting node " + node + " moved out of splitting to " + et);
205         }
206       }
207       // Server is stopping/stopped
208       throw new IOException("Server is " + (services.isStopping() ? "stopping" : "stopped"));
209     } catch (Exception e) {
210       if (e instanceof InterruptedException) {
211         Thread.currentThread().interrupt();
212       }
213       throw new IOException("Failed getting SPLITTING znode on " +
214         parent.getRegionInfo().getRegionNameAsString(), e);
215     }
216   }
217 
218   /**
219    * Finish off split transaction, transition the zknode
220    * @param services Used to online/offline regions.
221    * @param a daughter region
222    * @param b daughter region
223    * @param std split transaction details
224    * @param parent
225    * @throws IOException If thrown, transaction failed. Call
226    *  {@link org.apache.hadoop.hbase.regionserver.SplitTransaction#rollback(
227    *  Server, RegionServerServices)}
228    */
229   @Override
230   public void completeSplitTransaction(final RegionServerServices services, Region a, Region b,
231       SplitTransactionDetails std, Region parent) throws IOException {
232     ZkSplitTransactionDetails zstd = (ZkSplitTransactionDetails) std;
233     // Tell master about split by updating zk. If we fail, abort.
234     if (coordinationManager.getServer() != null) {
235       try {
236         int newNodeVersion = transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
237           b.getRegionInfo(), coordinationManager.getServer().getServerName(), zstd,
238           RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
239         if (newNodeVersion == -1) {
240           throw new IOException("Notifying master of RS split failed for region: "
241             + parent.getRegionInfo().getRegionNameAsString());
242         }
243         zstd.setZnodeVersion(newNodeVersion);
244 
245         int spins = 0;
246         // Now wait for the master to process the split. We know it's done
247         // when the znode is deleted. The reason we keep tickling the znode is
248         // that it's possible for the master to miss an event.
249         do {
250           if (spins % 10 == 0) {
251             LOG.debug("Still waiting on the master to process the split for "
252                 + parent.getRegionInfo().getEncodedName());
253           }
254           Thread.sleep(100);
255           // When this returns -1 it means the znode doesn't exist
256           zstd.setZnodeVersion(transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
257             b.getRegionInfo(), coordinationManager.getServer().getServerName(), zstd,
258             RS_ZK_REGION_SPLIT, RS_ZK_REGION_SPLIT));
259           spins++;
260         } while (zstd.getZnodeVersion() != -1 && !coordinationManager.getServer().isStopped()
261             && !services.isStopping());
262       } catch (Exception e) {
263         if (e instanceof InterruptedException) {
264           Thread.currentThread().interrupt();
265         }
266         throw new IOException("Failed telling master about split", e);
267       }
268     }
269 
270     // Leaving here, the splitdir with its dross will be in place but since the
271     // split was successful, just leave it; it'll be cleaned when parent is
272     // deleted and cleaned up.
273   }
274 
275   @Override
276   public void clean(final HRegionInfo hri) {
277     try {
278       // Only delete if its in expected state; could have been hijacked.
279       if (!ZKAssign.deleteNode(coordinationManager.getServer().getZooKeeper(),
280         hri.getEncodedName(), RS_ZK_REQUEST_REGION_SPLIT, coordinationManager.getServer()
281             .getServerName())) {
282         ZKAssign.deleteNode(coordinationManager.getServer().getZooKeeper(), hri.getEncodedName(),
283           RS_ZK_REGION_SPLITTING, coordinationManager.getServer().getServerName());
284       }
285     } catch (KeeperException.NoNodeException e) {
286       LOG.info("Failed cleanup zk node of " + hri.getRegionNameAsString(), e);
287     } catch (KeeperException e) {
288       coordinationManager.getServer().abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
289     }
290   }
291 
292   /**
293    * ZK-based implementation. Has details about whether the state transition should be reflected in
294    * ZK, as well as expected version of znode.
295    */
296   public static class ZkSplitTransactionDetails implements
297       SplitTransactionCoordination.SplitTransactionDetails {
298     private int znodeVersion;
299 
300     public ZkSplitTransactionDetails() {
301     }
302 
303     /**
304      * @return znode current version
305      */
306     public int getZnodeVersion() {
307       return znodeVersion;
308     }
309 
310     /**
311      * @param znodeVersion znode new version
312      */
313     public void setZnodeVersion(int znodeVersion) {
314       this.znodeVersion = znodeVersion;
315     }
316   }
317 
318   @Override
319   public SplitTransactionDetails getDefaultDetails() {
320     ZkSplitTransactionDetails zstd = new ZkSplitTransactionDetails();
321     zstd.setZnodeVersion(-1);
322     return zstd;
323   }
324 
325   @Override
326   public int processTransition(HRegionInfo p, HRegionInfo hri_a, HRegionInfo hri_b, ServerName sn,
327       SplitTransactionDetails std) throws IOException {
328     return transitionSplittingNode(p, hri_a, hri_b, sn, std, RS_ZK_REQUEST_REGION_SPLIT,
329       RS_ZK_REGION_SPLITTING);
330 
331   }
332 }