View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.regionserver.wal;
19  
20  import java.util.ArrayList;
21  import java.util.Collections;
22  import java.util.HashMap;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.Set;
26  import java.util.concurrent.ConcurrentHashMap;
27  import java.util.concurrent.ConcurrentMap;
28  
29  import org.apache.commons.logging.Log;
30  import org.apache.commons.logging.LogFactory;
31  import org.apache.hadoop.hbase.HConstants;
32  import org.apache.hadoop.hbase.HRegionInfo;
33  import org.apache.hadoop.hbase.classification.InterfaceAudience;
34  import org.apache.hadoop.hbase.util.Bytes;
35  import org.apache.hadoop.hbase.util.ImmutableByteArray;
36  
37  /**
38   * Accounting of sequence ids per region and then by column family. So we can our accounting
39   * current, call startCacheFlush and then finishedCacheFlush or abortCacheFlush so this instance can
40   * keep abreast of the state of sequence id persistence. Also call update per append.
41   * <p>
42   * For the implementation, we assume that all the {@code encodedRegionName} passed in is gotten by
43   * {@link HRegionInfo#getEncodedNameAsBytes()}. So it is safe to use it as a hash key. And for
44   * family name, we use {@link ImmutableByteArray} as key. This is because hash based map is much
45   * faster than RBTree or CSLM and here we are on the critical write path. See HBASE-16278 for more
46   * details.
47   */
48  @InterfaceAudience.Private
49  class SequenceIdAccounting {
50  
51    private static final Log LOG = LogFactory.getLog(SequenceIdAccounting.class);
52    /**
53     * This lock ties all operations on {@link SequenceIdAccounting#flushingSequenceIds} and
54     * {@link #lowestUnflushedSequenceIds} Maps. {@link #lowestUnflushedSequenceIds} has the
55     * lowest outstanding sequence ids EXCEPT when flushing. When we flush, the current
56     * lowest set for the region/column family are moved (atomically because of this lock) to
57     * {@link #flushingSequenceIds}.
58     * 
59     * <p>The two Maps are tied by this locking object EXCEPT when we go to update the lowest
60     * entry; see {@link #lowest(byte[], Set, Long)}. In here is a putIfAbsent call on
61     * {@link #lowestUnflushedSequenceIds}. In this latter case, we will add this lowest
62     * sequence id if we find that there is no entry for the current column family. There will be no
63     * entry only if we just came up OR we have moved aside current set of lowest sequence ids
64     * because the current set are being flushed (by putting them into {@link #flushingSequenceIds}).
65     * This is how we pick up the next 'lowest' sequence id per region per column family to be used
66     * figuring what is in the next flush.
67     */
68    private final Object tieLock = new Object();
69  
70    /**
71     * Map of encoded region names and family names to their OLDEST -- i.e. their first,
72     * the longest-lived, their 'earliest', the 'lowest' -- sequence id.
73     *
74     * <p>When we flush, the current lowest sequence ids get cleared and added to
75     * {@link #flushingSequenceIds}. The next append that comes in, is then added
76     * here to {@link #lowestUnflushedSequenceIds} as the next lowest sequenceid.
77     *
78     * <p>If flush fails, currently server is aborted so no need to restore previous sequence ids.
79     * <p>Needs to be concurrent Maps because we use putIfAbsent updating oldest.
80     */
81    private final ConcurrentMap<byte[], ConcurrentMap<ImmutableByteArray, Long>>
82      lowestUnflushedSequenceIds = new ConcurrentHashMap<>();
83  
84    /**
85     * Map of encoded region names and family names to their lowest or OLDEST sequence/edit id
86     * currently being flushed out to hfiles. Entries are moved here from
87     * {@link #lowestUnflushedSequenceIds} while the lock {@link #tieLock} is held
88     * (so movement between the Maps is atomic).
89     */
90    private final Map<byte[], Map<ImmutableByteArray, Long>> flushingSequenceIds = new HashMap<>();
91  
92   /**
93    * Map of region encoded names to the latest/highest region sequence id.  Updated on each
94    * call to append.
95    * <p>
96    * This map uses byte[] as the key, and uses reference equality. It works in our use case as we
97    * use {@link HRegionInfo#getEncodedNameAsBytes()} as keys. For a given region, it always returns
98    * the same array.
99    */
100   private Map<byte[], Long> highestSequenceIds = new HashMap<>();
101 
102   /**
103    * Returns the lowest unflushed sequence id for the region.
104    * @param encodedRegionName
105    * @return Lowest outstanding unflushed sequenceid for <code>encodedRegionName</code>. Will
106    * return {@link HConstants#NO_SEQNUM} when none.
107    */
108   long getLowestSequenceId(final byte[] encodedRegionName) {
109     synchronized (this.tieLock) {
110       Map<?, Long> m = this.flushingSequenceIds.get(encodedRegionName);
111       long flushingLowest = m != null ? getLowestSequenceId(m) : Long.MAX_VALUE;
112       m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
113       long unflushedLowest = m != null ? getLowestSequenceId(m) : HConstants.NO_SEQNUM;
114       return Math.min(flushingLowest, unflushedLowest);
115     }
116   }
117 
118   /**
119    * @param encodedRegionName
120    * @param familyName
121    * @return Lowest outstanding unflushed sequenceid for <code>encodedRegionname</code> and
122    *         <code>familyName</code>. Returned sequenceid may be for an edit currently being
123    *         flushed.
124    */
125   long getLowestSequenceId(final byte[] encodedRegionName, final byte[] familyName) {
126     ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
127     synchronized (this.tieLock) {
128       Map<ImmutableByteArray, Long> m = this.flushingSequenceIds.get(encodedRegionName);
129       if (m != null) {
130         Long lowest = m.get(familyNameWrapper);
131         if (lowest != null) {
132           return lowest;
133         }
134       }
135       m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
136       if (m != null) {
137         Long lowest = m.get(familyNameWrapper);
138         if (lowest != null) {
139           return lowest;
140         }
141       }
142     }
143     return HConstants.NO_SEQNUM;
144   }
145 
146   /**
147    * Reset the accounting of highest sequenceid by regionname.
148    * @return Return the previous accounting Map of regions to the last sequence id written into
149    * each.
150    */
151   Map<byte[], Long> resetHighest() {
152     Map<byte[], Long> old = this.highestSequenceIds;
153     this.highestSequenceIds = new HashMap<byte[], Long>();
154     return old;
155   }
156 
157   /**
158    * We've been passed a new sequenceid for the region. Set it as highest seen for this region and
159    * if we are to record oldest, or lowest sequenceids, save it as oldest seen if nothing
160    * currently older.
161    * @param encodedRegionName
162    * @param families
163    * @param sequenceid
164    * @param lowest Whether to keep running account of oldest sequence id.
165    */
166   void update(byte[] encodedRegionName, Set<byte[]> families, long sequenceid,
167       final boolean lowest) {
168     Long l = Long.valueOf(sequenceid);
169     this.highestSequenceIds.put(encodedRegionName, l);
170     if (lowest) {
171       ConcurrentMap<ImmutableByteArray, Long> m = getOrCreateLowestSequenceIds(encodedRegionName);
172       for (byte[] familyName : families) {
173         m.putIfAbsent(ImmutableByteArray.wrap(familyName), l);
174       }
175     }
176   }
177 
178   ConcurrentMap<ImmutableByteArray, Long> getOrCreateLowestSequenceIds(byte[] encodedRegionName) {
179     // Intentionally, this access is done outside of this.regionSequenceIdLock. Done per append.
180     ConcurrentMap<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds
181         .get(encodedRegionName);
182     if (m != null) {
183       return m;
184     }
185     m = new ConcurrentHashMap<>();
186     // Another thread may have added it ahead of us.
187     ConcurrentMap<ImmutableByteArray, Long> alreadyPut = this.lowestUnflushedSequenceIds
188         .putIfAbsent(encodedRegionName, m);
189     return alreadyPut == null ? m : alreadyPut;
190   }
191 
192   /**
193    * @param sequenceids Map to search for lowest value.
194    * @return Lowest value found in <code>sequenceids</code>.
195    */
196   private static long getLowestSequenceId(Map<?, Long> sequenceids) {
197     long lowest = HConstants.NO_SEQNUM;
198     for (Long sid: sequenceids.values()) {
199       if (lowest == HConstants.NO_SEQNUM || sid.longValue() < lowest) {
200         lowest = sid.longValue();
201       }
202     }
203     return lowest;
204   }
205 
206   /**
207    * @param src
208    * @return New Map that has same keys as <code>src</code> but instead of a Map for a value, it
209    *         instead has found the smallest sequence id and it returns that as the value instead.
210    */
211   private <T extends Map<?, Long>> Map<byte[], Long> flattenToLowestSequenceId(Map<byte[], T> src) {
212     if (src == null || src.isEmpty()) {
213       return null;
214     }
215     Map<byte[], Long> tgt = new HashMap<>();
216     for (Map.Entry<byte[], T> entry : src.entrySet()) {
217       long lowestSeqId = getLowestSequenceId(entry.getValue());
218       if (lowestSeqId != HConstants.NO_SEQNUM) {
219         tgt.put(entry.getKey(), lowestSeqId);
220       }
221     }
222     return tgt;
223   }
224 
225   /**
226    * @param encodedRegionName Region to flush.
227    * @param families Families to flush. May be a subset of all families in the region.
228    * @return Returns {@link HConstants#NO_SEQNUM} if we are flushing the whole region OR if
229    * we are flushing a subset of all families but there are no edits in those families not
230    * being flushed; in other words, this is effectively same as a flush of all of the region
231    * though we were passed a subset of regions. Otherwise, it returns the sequence id of the
232    * oldest/lowest outstanding edit.
233    */
234   Long startCacheFlush(final byte[] encodedRegionName, final Set<byte[]> families) {
235     Map<ImmutableByteArray, Long> oldSequenceIds = null;
236     Long lowestUnflushedInRegion = HConstants.NO_SEQNUM;
237     synchronized (tieLock) {
238       Map<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
239       if (m != null) {
240         // NOTE: Removal from this.lowestUnflushedSequenceIds must be done in controlled
241         // circumstance because another concurrent thread now may add sequenceids for this family
242         // (see above in getOrCreateLowestSequenceId). Make sure you are ok with this. Usually it
243         // is fine because updates are blocked when this method is called. Make sure!!!
244         for (byte[] familyName : families) {
245           ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
246           Long seqId = m.remove(familyNameWrapper);
247           if (seqId != null) {
248             if (oldSequenceIds == null) {
249               oldSequenceIds = new HashMap<>();
250             }
251             oldSequenceIds.put(familyNameWrapper, seqId);
252           }
253         }
254         if (oldSequenceIds != null && !oldSequenceIds.isEmpty()) {
255           if (this.flushingSequenceIds.put(encodedRegionName, oldSequenceIds) != null) {
256             LOG.warn("Flushing Map not cleaned up for " + Bytes.toString(encodedRegionName) +
257               ", sequenceid=" + oldSequenceIds);
258           }
259         }
260         if (m.isEmpty()) {
261           // Remove it otherwise it will be in oldestUnflushedStoreSequenceIds for ever
262           // even if the region is already moved to other server.
263           // Do not worry about data racing, we held write lock of region when calling
264           // startCacheFlush, so no one can add value to the map we removed.
265           this.lowestUnflushedSequenceIds.remove(encodedRegionName);
266         } else {
267           // Flushing a subset of the region families. Return the sequence id of the oldest entry.
268           lowestUnflushedInRegion = Collections.min(m.values());
269         }
270       }
271     }
272     // Do this check outside lock.
273     if (oldSequenceIds != null && oldSequenceIds.isEmpty()) {
274       // TODO: if we have no oldStoreSeqNum, and WAL is not disabled, presumably either
275       // the region is already flushing (which would make this call invalid), or there
276       // were no appends after last flush, so why are we starting flush? Maybe we should
277       // assert not empty. Less rigorous, but safer, alternative is telling the caller to stop.
278       // For now preserve old logic.
279       LOG.warn("Couldn't find oldest sequenceid for " + Bytes.toString(encodedRegionName));
280     }
281     return lowestUnflushedInRegion;
282   }
283 
284   void completeCacheFlush(final byte[] encodedRegionName) {
285     synchronized (tieLock) {
286       this.flushingSequenceIds.remove(encodedRegionName);
287     }
288   }
289 
290   void abortCacheFlush(final byte[] encodedRegionName) {
291     // Method is called when we are crashing down because failed write flush AND it is called
292     // if we fail prepare. The below is for the fail prepare case; we restore the old sequence ids.
293     Map<ImmutableByteArray, Long> flushing = null;
294     Map<ImmutableByteArray, Long> tmpMap = new HashMap<>();
295     // Here we are moving sequenceids from flushing back to unflushed; doing opposite of what
296     // happened in startCacheFlush. During prepare phase, we have update lock on the region so
297     // no edits should be coming in via append.
298     synchronized (tieLock) {
299       flushing = this.flushingSequenceIds.remove(encodedRegionName);
300       if (flushing != null) {
301         Map<ImmutableByteArray, Long> unflushed = getOrCreateLowestSequenceIds(encodedRegionName);
302         for (Map.Entry<ImmutableByteArray, Long> e: flushing.entrySet()) {
303           // Set into unflushed the 'old' oldest sequenceid and if any value in flushed with this
304           // value, it will now be in tmpMap.
305           tmpMap.put(e.getKey(), unflushed.put(e.getKey(), e.getValue()));
306         }
307       }
308     }
309 
310     // Here we are doing some 'test' to see if edits are going in out of order. What is it for?
311     // Carried over from old code.
312     if (flushing != null) {
313       for (Map.Entry<ImmutableByteArray, Long> e : flushing.entrySet()) {
314         Long currentId = tmpMap.get(e.getKey());
315         if (currentId != null && currentId.longValue() <= e.getValue().longValue()) {
316           String errorStr = Bytes.toString(encodedRegionName) + " family "
317               + e.getKey().toStringUtf8() + " acquired edits out of order current memstore seq="
318               + currentId + ", previous oldest unflushed id=" + e.getValue();
319           LOG.error(errorStr);
320           Runtime.getRuntime().halt(1);
321         }
322       }
323     }
324   }
325 
326   /**
327    * See if passed <code>sequenceids</code> are lower -- i.e. earlier -- than any outstanding
328    * sequenceids, sequenceids we are holding on to in this accounting instance.
329    * @param sequenceids Keyed by encoded region name. Cannot be null (doesn't make sense for it to
330    *          be null).
331    * @return true if all sequenceids are lower, older than, the old sequenceids in this instance.
332    */
333   boolean areAllLower(Map<byte[], Long> sequenceids) {
334     Map<byte[], Long> flushing = null;
335     Map<byte[], Long> unflushed = null;
336     synchronized (this.tieLock) {
337       // Get a flattened -- only the oldest sequenceid -- copy of current flushing and unflushed
338       // data structures to use in tests below.
339       flushing = flattenToLowestSequenceId(this.flushingSequenceIds);
340       unflushed = flattenToLowestSequenceId(this.lowestUnflushedSequenceIds);
341     }
342     for (Map.Entry<byte[], Long> e : sequenceids.entrySet()) {
343       long oldestFlushing = Long.MAX_VALUE;
344       long oldestUnflushed = Long.MAX_VALUE;
345       if (flushing != null && flushing.containsKey(e.getKey())) {
346         oldestFlushing = flushing.get(e.getKey());
347       }
348       if (unflushed != null && unflushed.containsKey(e.getKey())) {
349         oldestUnflushed = unflushed.get(e.getKey());
350       }
351       long min = Math.min(oldestFlushing, oldestUnflushed);
352       if (min <= e.getValue()) {
353         return false;
354       }
355     }
356     return true;
357   }
358 
359   /**
360    * Iterates over the given Map and compares sequence ids with corresponding entries in
361    * {@link #oldestUnflushedRegionSequenceIds}. If a region in
362    * {@link #oldestUnflushedRegionSequenceIds} has a sequence id less than that passed in
363    * <code>sequenceids</code> then return it.
364    * @param sequenceids Sequenceids keyed by encoded region name.
365    * @return regions found in this instance with sequence ids less than those passed in.
366    */
367   byte[][] findLower(Map<byte[], Long> sequenceids) {
368     List<byte[]> toFlush = null;
369     // Keeping the old behavior of iterating unflushedSeqNums under oldestSeqNumsLock.
370     synchronized (tieLock) {
371       for (Map.Entry<byte[], Long> e : sequenceids.entrySet()) {
372         Map<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds.get(e.getKey());
373         if (m == null) {
374           continue;
375         }
376         // The lowest sequence id outstanding for this region.
377         long lowest = getLowestSequenceId(m);
378         if (lowest != HConstants.NO_SEQNUM && lowest <= e.getValue()) {
379           if (toFlush == null) {
380             toFlush = new ArrayList<byte[]>();
381           }
382           toFlush.add(e.getKey());
383         }
384       }
385     }
386     return toFlush == null ? null : toFlush.toArray(new byte[0][]);
387   }
388 }