1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18 package org.apache.hadoop.hbase.regionserver.wal;
19
20 import java.util.ArrayList;
21 import java.util.Collections;
22 import java.util.HashMap;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Set;
26 import java.util.concurrent.ConcurrentHashMap;
27 import java.util.concurrent.ConcurrentMap;
28
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.hadoop.hbase.HConstants;
32 import org.apache.hadoop.hbase.HRegionInfo;
33 import org.apache.hadoop.hbase.classification.InterfaceAudience;
34 import org.apache.hadoop.hbase.util.Bytes;
35 import org.apache.hadoop.hbase.util.ImmutableByteArray;
36
37 /**
38 * Accounting of sequence ids per region and then by column family. To keep our accounting
39 * current, call startCacheFlush and then completeCacheFlush or abortCacheFlush so this instance can
40 * keep abreast of the state of sequence id persistence. Also call update per append.
41 * <p>
42 * For the implementation, we assume that all the {@code encodedRegionName} passed in is gotten by
43 * {@link HRegionInfo#getEncodedNameAsBytes()}. So it is safe to use it as a hash key. And for
44 * family name, we use {@link ImmutableByteArray} as key. This is because hash based map is much
45 * faster than RBTree or CSLM and here we are on the critical write path. See HBASE-16278 for more
46 * details.
47 */
48 @InterfaceAudience.Private
49 class SequenceIdAccounting {
50
51 private static final Log LOG = LogFactory.getLog(SequenceIdAccounting.class);
52 /**
53 * This lock ties all operations on {@link SequenceIdAccounting#flushingSequenceIds} and
54 * {@link #lowestUnflushedSequenceIds} Maps. {@link #lowestUnflushedSequenceIds} has the
55 * lowest outstanding sequence ids EXCEPT when flushing. When we flush, the current
56 * lowest set for the region/column family are moved (atomically because of this lock) to
57 * {@link #flushingSequenceIds}.
58 *
59 * <p>The two Maps are tied by this locking object EXCEPT when we go to update the lowest
60 * entry; see {@link #lowest(byte[], Set, Long)}. In here is a putIfAbsent call on
61 * {@link #lowestUnflushedSequenceIds}. In this latter case, we will add this lowest
62 * sequence id if we find that there is no entry for the current column family. There will be no
63 * entry only if we just came up OR we have moved aside current set of lowest sequence ids
64 * because the current set are being flushed (by putting them into {@link #flushingSequenceIds}).
65 * This is how we pick up the next 'lowest' sequence id per region per column family to be used
66 * figuring what is in the next flush.
67 */
68 private final Object tieLock = new Object();
69
70 /**
71 * Map of encoded region names and family names to their OLDEST -- i.e. their first,
72 * the longest-lived, their 'earliest', the 'lowest' -- sequence id.
73 *
74 * <p>When we flush, the current lowest sequence ids get cleared and added to
75 * {@link #flushingSequenceIds}. The next append that comes in, is then added
76 * here to {@link #lowestUnflushedSequenceIds} as the next lowest sequenceid.
77 *
78 * <p>If flush fails, currently server is aborted so no need to restore previous sequence ids.
79 * <p>Needs to be concurrent Maps because we use putIfAbsent updating oldest.
80 */
81 private final ConcurrentMap<byte[], ConcurrentMap<ImmutableByteArray, Long>>
82 lowestUnflushedSequenceIds = new ConcurrentHashMap<>();
83
84 /**
85 * Map of encoded region names and family names to their lowest or OLDEST sequence/edit id
86 * currently being flushed out to hfiles. Entries are moved here from
87 * {@link #lowestUnflushedSequenceIds} while the lock {@link #tieLock} is held
88 * (so movement between the Maps is atomic).
89 */
90 private final Map<byte[], Map<ImmutableByteArray, Long>> flushingSequenceIds = new HashMap<>();
91
92 /**
93 * Map of region encoded names to the latest/highest region sequence id. Updated on each
94 * call to append.
95 * <p>
96 * This map uses byte[] as the key, and uses reference equality. It works in our use case as we
97 * use {@link HRegionInfo#getEncodedNameAsBytes()} as keys. For a given region, it always returns
98 * the same array.
99 */
100 private Map<byte[], Long> highestSequenceIds = new HashMap<>();
101
102 /**
103 * Returns the lowest unflushed sequence id for the region.
104 * @param encodedRegionName
105 * @return Lowest outstanding unflushed sequenceid for <code>encodedRegionName</code>. Will
106 * return {@link HConstants#NO_SEQNUM} when none.
107 */
108 long getLowestSequenceId(final byte[] encodedRegionName) {
109 synchronized (this.tieLock) {
110 Map<?, Long> m = this.flushingSequenceIds.get(encodedRegionName);
111 long flushingLowest = m != null ? getLowestSequenceId(m) : Long.MAX_VALUE;
112 m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
113 long unflushedLowest = m != null ? getLowestSequenceId(m) : HConstants.NO_SEQNUM;
114 return Math.min(flushingLowest, unflushedLowest);
115 }
116 }
117
118 /**
119 * @param encodedRegionName
120 * @param familyName
121 * @return Lowest outstanding unflushed sequenceid for <code>encodedRegionname</code> and
122 * <code>familyName</code>. Returned sequenceid may be for an edit currently being
123 * flushed.
124 */
125 long getLowestSequenceId(final byte[] encodedRegionName, final byte[] familyName) {
126 ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
127 synchronized (this.tieLock) {
128 Map<ImmutableByteArray, Long> m = this.flushingSequenceIds.get(encodedRegionName);
129 if (m != null) {
130 Long lowest = m.get(familyNameWrapper);
131 if (lowest != null) {
132 return lowest;
133 }
134 }
135 m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
136 if (m != null) {
137 Long lowest = m.get(familyNameWrapper);
138 if (lowest != null) {
139 return lowest;
140 }
141 }
142 }
143 return HConstants.NO_SEQNUM;
144 }
145
146 /**
147 * Reset the accounting of highest sequenceid by regionname.
148 * @return Return the previous accounting Map of regions to the last sequence id written into
149 * each.
150 */
151 Map<byte[], Long> resetHighest() {
152 Map<byte[], Long> old = this.highestSequenceIds;
153 this.highestSequenceIds = new HashMap<byte[], Long>();
154 return old;
155 }
156
157 /**
158 * We've been passed a new sequenceid for the region. Set it as highest seen for this region and
159 * if we are to record oldest, or lowest sequenceids, save it as oldest seen if nothing
160 * currently older.
161 * @param encodedRegionName
162 * @param families
163 * @param sequenceid
164 * @param lowest Whether to keep running account of oldest sequence id.
165 */
166 void update(byte[] encodedRegionName, Set<byte[]> families, long sequenceid,
167 final boolean lowest) {
168 Long l = Long.valueOf(sequenceid);
169 this.highestSequenceIds.put(encodedRegionName, l);
170 if (lowest) {
171 ConcurrentMap<ImmutableByteArray, Long> m = getOrCreateLowestSequenceIds(encodedRegionName);
172 for (byte[] familyName : families) {
173 m.putIfAbsent(ImmutableByteArray.wrap(familyName), l);
174 }
175 }
176 }
177
178 ConcurrentMap<ImmutableByteArray, Long> getOrCreateLowestSequenceIds(byte[] encodedRegionName) {
179 // Intentionally, this access is done outside of this.regionSequenceIdLock. Done per append.
180 ConcurrentMap<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds
181 .get(encodedRegionName);
182 if (m != null) {
183 return m;
184 }
185 m = new ConcurrentHashMap<>();
186 // Another thread may have added it ahead of us.
187 ConcurrentMap<ImmutableByteArray, Long> alreadyPut = this.lowestUnflushedSequenceIds
188 .putIfAbsent(encodedRegionName, m);
189 return alreadyPut == null ? m : alreadyPut;
190 }
191
192 /**
193 * @param sequenceids Map to search for lowest value.
194 * @return Lowest value found in <code>sequenceids</code>.
195 */
196 private static long getLowestSequenceId(Map<?, Long> sequenceids) {
197 long lowest = HConstants.NO_SEQNUM;
198 for (Long sid: sequenceids.values()) {
199 if (lowest == HConstants.NO_SEQNUM || sid.longValue() < lowest) {
200 lowest = sid.longValue();
201 }
202 }
203 return lowest;
204 }
205
206 /**
207 * @param src
208 * @return New Map that has same keys as <code>src</code> but instead of a Map for a value, it
209 * instead has found the smallest sequence id and it returns that as the value instead.
210 */
211 private <T extends Map<?, Long>> Map<byte[], Long> flattenToLowestSequenceId(Map<byte[], T> src) {
212 if (src == null || src.isEmpty()) {
213 return null;
214 }
215 Map<byte[], Long> tgt = new HashMap<>();
216 for (Map.Entry<byte[], T> entry : src.entrySet()) {
217 long lowestSeqId = getLowestSequenceId(entry.getValue());
218 if (lowestSeqId != HConstants.NO_SEQNUM) {
219 tgt.put(entry.getKey(), lowestSeqId);
220 }
221 }
222 return tgt;
223 }
224
225 /**
226 * @param encodedRegionName Region to flush.
227 * @param families Families to flush. May be a subset of all families in the region.
228 * @return Returns {@link HConstants#NO_SEQNUM} if we are flushing the whole region OR if
229 * we are flushing a subset of all families but there are no edits in those families not
230 * being flushed; in other words, this is effectively same as a flush of all of the region
231 * though we were passed a subset of regions. Otherwise, it returns the sequence id of the
232 * oldest/lowest outstanding edit.
233 */
234 Long startCacheFlush(final byte[] encodedRegionName, final Set<byte[]> families) {
235 Map<ImmutableByteArray, Long> oldSequenceIds = null;
236 Long lowestUnflushedInRegion = HConstants.NO_SEQNUM;
237 synchronized (tieLock) {
238 Map<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
239 if (m != null) {
240 // NOTE: Removal from this.lowestUnflushedSequenceIds must be done in controlled
241 // circumstance because another concurrent thread now may add sequenceids for this family
242 // (see above in getOrCreateLowestSequenceId). Make sure you are ok with this. Usually it
243 // is fine because updates are blocked when this method is called. Make sure!!!
244 for (byte[] familyName : families) {
245 ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
246 Long seqId = m.remove(familyNameWrapper);
247 if (seqId != null) {
248 if (oldSequenceIds == null) {
249 oldSequenceIds = new HashMap<>();
250 }
251 oldSequenceIds.put(familyNameWrapper, seqId);
252 }
253 }
254 if (oldSequenceIds != null && !oldSequenceIds.isEmpty()) {
255 if (this.flushingSequenceIds.put(encodedRegionName, oldSequenceIds) != null) {
256 LOG.warn("Flushing Map not cleaned up for " + Bytes.toString(encodedRegionName) +
257 ", sequenceid=" + oldSequenceIds);
258 }
259 }
260 if (m.isEmpty()) {
261 // Remove it otherwise it will be in oldestUnflushedStoreSequenceIds for ever
262 // even if the region is already moved to other server.
263 // Do not worry about data racing, we held write lock of region when calling
264 // startCacheFlush, so no one can add value to the map we removed.
265 this.lowestUnflushedSequenceIds.remove(encodedRegionName);
266 } else {
267 // Flushing a subset of the region families. Return the sequence id of the oldest entry.
268 lowestUnflushedInRegion = Collections.min(m.values());
269 }
270 }
271 }
272 // Do this check outside lock.
273 if (oldSequenceIds != null && oldSequenceIds.isEmpty()) {
274 // TODO: if we have no oldStoreSeqNum, and WAL is not disabled, presumably either
275 // the region is already flushing (which would make this call invalid), or there
276 // were no appends after last flush, so why are we starting flush? Maybe we should
277 // assert not empty. Less rigorous, but safer, alternative is telling the caller to stop.
278 // For now preserve old logic.
279 LOG.warn("Couldn't find oldest sequenceid for " + Bytes.toString(encodedRegionName));
280 }
281 return lowestUnflushedInRegion;
282 }
283
284 void completeCacheFlush(final byte[] encodedRegionName) {
285 synchronized (tieLock) {
286 this.flushingSequenceIds.remove(encodedRegionName);
287 }
288 }
289
290 void abortCacheFlush(final byte[] encodedRegionName) {
291 // Method is called when we are crashing down because failed write flush AND it is called
292 // if we fail prepare. The below is for the fail prepare case; we restore the old sequence ids.
293 Map<ImmutableByteArray, Long> flushing = null;
294 Map<ImmutableByteArray, Long> tmpMap = new HashMap<>();
295 // Here we are moving sequenceids from flushing back to unflushed; doing opposite of what
296 // happened in startCacheFlush. During prepare phase, we have update lock on the region so
297 // no edits should be coming in via append.
298 synchronized (tieLock) {
299 flushing = this.flushingSequenceIds.remove(encodedRegionName);
300 if (flushing != null) {
301 Map<ImmutableByteArray, Long> unflushed = getOrCreateLowestSequenceIds(encodedRegionName);
302 for (Map.Entry<ImmutableByteArray, Long> e: flushing.entrySet()) {
303 // Set into unflushed the 'old' oldest sequenceid and if any value in flushed with this
304 // value, it will now be in tmpMap.
305 tmpMap.put(e.getKey(), unflushed.put(e.getKey(), e.getValue()));
306 }
307 }
308 }
309
310 // Here we are doing some 'test' to see if edits are going in out of order. What is it for?
311 // Carried over from old code.
312 if (flushing != null) {
313 for (Map.Entry<ImmutableByteArray, Long> e : flushing.entrySet()) {
314 Long currentId = tmpMap.get(e.getKey());
315 if (currentId != null && currentId.longValue() <= e.getValue().longValue()) {
316 String errorStr = Bytes.toString(encodedRegionName) + " family "
317 + e.getKey().toStringUtf8() + " acquired edits out of order current memstore seq="
318 + currentId + ", previous oldest unflushed id=" + e.getValue();
319 LOG.error(errorStr);
320 Runtime.getRuntime().halt(1);
321 }
322 }
323 }
324 }
325
326 /**
327 * See if passed <code>sequenceids</code> are lower -- i.e. earlier -- than any outstanding
328 * sequenceids, sequenceids we are holding on to in this accounting instance.
329 * @param sequenceids Keyed by encoded region name. Cannot be null (doesn't make sense for it to
330 * be null).
331 * @return true if all sequenceids are lower, older than, the old sequenceids in this instance.
332 */
333 boolean areAllLower(Map<byte[], Long> sequenceids) {
334 Map<byte[], Long> flushing = null;
335 Map<byte[], Long> unflushed = null;
336 synchronized (this.tieLock) {
337 // Get a flattened -- only the oldest sequenceid -- copy of current flushing and unflushed
338 // data structures to use in tests below.
339 flushing = flattenToLowestSequenceId(this.flushingSequenceIds);
340 unflushed = flattenToLowestSequenceId(this.lowestUnflushedSequenceIds);
341 }
342 for (Map.Entry<byte[], Long> e : sequenceids.entrySet()) {
343 long oldestFlushing = Long.MAX_VALUE;
344 long oldestUnflushed = Long.MAX_VALUE;
345 if (flushing != null && flushing.containsKey(e.getKey())) {
346 oldestFlushing = flushing.get(e.getKey());
347 }
348 if (unflushed != null && unflushed.containsKey(e.getKey())) {
349 oldestUnflushed = unflushed.get(e.getKey());
350 }
351 long min = Math.min(oldestFlushing, oldestUnflushed);
352 if (min <= e.getValue()) {
353 return false;
354 }
355 }
356 return true;
357 }
358
359 /**
360 * Iterates over the given Map and compares sequence ids with corresponding entries in
361 * {@link #oldestUnflushedRegionSequenceIds}. If a region in
362 * {@link #oldestUnflushedRegionSequenceIds} has a sequence id less than that passed in
363 * <code>sequenceids</code> then return it.
364 * @param sequenceids Sequenceids keyed by encoded region name.
365 * @return regions found in this instance with sequence ids less than those passed in.
366 */
367 byte[][] findLower(Map<byte[], Long> sequenceids) {
368 List<byte[]> toFlush = null;
369 // Keeping the old behavior of iterating unflushedSeqNums under oldestSeqNumsLock.
370 synchronized (tieLock) {
371 for (Map.Entry<byte[], Long> e : sequenceids.entrySet()) {
372 Map<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds.get(e.getKey());
373 if (m == null) {
374 continue;
375 }
376 // The lowest sequence id outstanding for this region.
377 long lowest = getLowestSequenceId(m);
378 if (lowest != HConstants.NO_SEQNUM && lowest <= e.getValue()) {
379 if (toFlush == null) {
380 toFlush = new ArrayList<byte[]>();
381 }
382 toFlush.add(e.getKey());
383 }
384 }
385 }
386 return toFlush == null ? null : toFlush.toArray(new byte[0][]);
387 }
388 }