View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.lang.management.ManagementFactory;
23  import java.lang.management.RuntimeMXBean;
24  import java.util.ArrayList;
25  import java.util.Collections;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.NavigableSet;
29  import java.util.SortedSet;
30  import java.util.concurrent.atomic.AtomicInteger;
31  import java.util.concurrent.atomic.AtomicLong;
32  
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.apache.hadoop.hbase.classification.InterfaceAudience;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.hbase.Cell;
38  import org.apache.hadoop.hbase.CellUtil;
39  import org.apache.hadoop.hbase.HBaseConfiguration;
40  import org.apache.hadoop.hbase.HConstants;
41  import org.apache.hadoop.hbase.KeyValue;
42  import org.apache.hadoop.hbase.KeyValueUtil;
43  import org.apache.hadoop.hbase.client.Scan;
44  import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
45  import org.apache.hadoop.hbase.io.TimeRange;
46  import org.apache.hadoop.hbase.util.ByteRange;
47  import org.apache.hadoop.hbase.util.Bytes;
48  import org.apache.hadoop.hbase.util.ClassSize;
49  import org.apache.hadoop.hbase.util.CollectionBackedScanner;
50  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
51  import org.apache.hadoop.hbase.util.ReflectionUtils;
52  import org.apache.htrace.Trace;
53  
54  /**
55   * The MemStore holds in-memory modifications to the Store.  Modifications
56   * are {@link Cell}s.  When asked to flush, current memstore is moved
57   * to snapshot and is cleared.  We continue to serve edits out of new memstore
58   * and backing snapshot until flusher reports in that the flush succeeded. At
59   * this point we let the snapshot go.
60   *  <p>
61   * The MemStore functions should not be called in parallel. Callers should hold
62   *  write and read locks. This is done in {@link HStore}.
63   *  </p>
64   *
65   * TODO: Adjust size of the memstore when we remove items because they have
66   * been deleted.
67   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
68   * in KV size.
69   */
70  @InterfaceAudience.Private
71  public class DefaultMemStore implements MemStore {
72    private static final Log LOG = LogFactory.getLog(DefaultMemStore.class);
73    static final String USEMSLAB_KEY = "hbase.hregion.memstore.mslab.enabled";
74    private static final boolean USEMSLAB_DEFAULT = true;
75    private static final String MSLAB_CLASS_NAME = "hbase.regionserver.mslab.class";
76  
77    private Configuration conf;
78  
79    final KeyValue.KVComparator comparator;
80  
81    // Used to track when to flush
82    private volatile long timeOfOldestEdit = Long.MAX_VALUE;
83  
84    private volatile long snapshotId;
85    private volatile boolean tagsPresent;
86  
87    volatile Section activeSection;
88    volatile Section snapshotSection;
89  
90    /**
91     * Default constructor. Used for tests.
92     */
93    public DefaultMemStore() {
94      this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
95    }
96  
97    /**
98     * Constructor.
99     * @param c Comparator
100    */
101   public DefaultMemStore(final Configuration conf,
102                   final KeyValue.KVComparator c) {
103     this.conf = conf;
104     this.comparator = c;
105     this.activeSection = Section.newActiveSection(comparator, conf);
106     this.snapshotSection = Section.newSnapshotSection(comparator);
107   }
108 
109   /**
110    * Creates a snapshot of the current memstore.
111    * Snapshot must be cleared by call to {@link #clearSnapshot(long)}
112    */
113   @Override
114   public MemStoreSnapshot snapshot() {
115     // If snapshot currently has entries, then flusher failed or didn't call
116     // cleanup.  Log a warning.
117     if (!snapshotSection.getCellSkipListSet().isEmpty()) {
118       LOG.warn("Snapshot called again without clearing previous. " +
119           "Doing nothing. Another ongoing flush or did we fail last attempt?");
120     } else {
121       this.snapshotId = EnvironmentEdgeManager.currentTime();
122       if (!activeSection.getCellSkipListSet().isEmpty()) {
123         snapshotSection = activeSection;
124         activeSection = Section.newActiveSection(comparator, conf);
125         snapshotSection.getHeapSize().addAndGet(-DEEP_OVERHEAD);
126         timeOfOldestEdit = Long.MAX_VALUE;
127       }
128     }
129     MemStoreSnapshot memStoreSnapshot = new MemStoreSnapshot(this.snapshotId,
130         snapshotSection.getCellsCount().get(), snapshotSection.getHeapSize().get(),
131         snapshotSection.getTimeRangeTracker(),
132         new CollectionBackedScanner(snapshotSection.getCellSkipListSet(), this.comparator),
133         this.tagsPresent);
134     this.tagsPresent = false;
135     return memStoreSnapshot;
136   }
137 
138   /**
139    * The passed snapshot was successfully persisted; it can be let go.
140    * @param id Id of the snapshot to clean out.
141    * @throws UnexpectedStateException
142    * @see #snapshot()
143    */
144   @Override
145   public void clearSnapshot(long id) throws UnexpectedStateException {
146     if (this.snapshotId == -1) return;  // already cleared
147     if (this.snapshotId != id) {
148       throw new UnexpectedStateException("Current snapshot id is " + this.snapshotId + ",passed "
149           + id);
150     }
151     // OK. Passed in snapshot is same as current snapshot.
152     MemStoreLAB tmpAllocator = snapshotSection.getMemStoreLAB();
153     snapshotSection = Section.newSnapshotSection(comparator);
154     if (tmpAllocator != null) {
155       tmpAllocator.close();
156     }
157     this.snapshotId = -1;
158   }
159 
160   @Override
161   public long getFlushableSize() {
162     long snapshotSize = snapshotSection.getHeapSize().get();
163     return snapshotSize > 0 ? snapshotSize : keySize();
164   }
165 
166   @Override
167   public long getSnapshotSize() {
168     return snapshotSection.getHeapSize().get();
169   }
170 
171   /**
172    * Write an update
173    * @param cell
174    * @return approximate size of the passed cell.
175    */
176   @Override
177   public long add(Cell cell) {
178     Cell toAdd = maybeCloneWithAllocator(cell);
179     boolean mslabUsed = (toAdd != cell);
180     return internalAdd(toAdd, mslabUsed);
181   }
182 
183   @Override
184   public long add(Iterable<Cell> cells) {
185     long size = 0;
186     for (Cell cell : cells) {
187       size += add(cell);
188     }
189     return size;
190   }
191 
192   @Override
193   public long timeOfOldestEdit() {
194     return timeOfOldestEdit;
195   }
196 
197   private boolean addToCellSet(Cell e) {
198     boolean b = this.activeSection.getCellSkipListSet().add(e);
199     // In no tags case this NoTagsKeyValue.getTagsLength() is a cheap call.
200     // When we use ACL CP or Visibility CP which deals with Tags during
201     // mutation, the TagRewriteCell.getTagsLength() is a cheaper call. We do not
202     // parse the byte[] to identify the tags length.
203     if(e.getTagsLength() > 0) {
204       tagsPresent = true;
205     }
206     setOldestEditTimeToNow();
207     return b;
208   }
209 
210   private boolean removeFromCellSet(Cell e) {
211     boolean b = this.activeSection.getCellSkipListSet().remove(e);
212     setOldestEditTimeToNow();
213     return b;
214   }
215 
216   void setOldestEditTimeToNow() {
217     if (timeOfOldestEdit == Long.MAX_VALUE) {
218       timeOfOldestEdit = EnvironmentEdgeManager.currentTime();
219     }
220   }
221 
222   /**
223    * Internal version of add() that doesn't clone Cells with the
224    * allocator, and doesn't take the lock.
225    *
226    * Callers should ensure they already have the read lock taken
227    * @param toAdd the cell to add
228    * @param mslabUsed whether using MSLAB
229    * @return the heap size change in bytes
230    */
231   private long internalAdd(final Cell toAdd, boolean mslabUsed) {
232     boolean notPresent = addToCellSet(toAdd);
233     long s = heapSizeChange(toAdd, notPresent);
234     if (notPresent) {
235       activeSection.getCellsCount().incrementAndGet();
236     }
237     // If there's already a same cell in the CellSet and we are using MSLAB, we must count in the
238     // MSLAB allocation size as well, or else there will be memory leak (occupied heap size larger
239     // than the counted number)
240     if (!notPresent && mslabUsed) {
241       s += getCellLength(toAdd);
242     }
243     activeSection.getTimeRangeTracker().includeTimestamp(toAdd);
244     activeSection.getHeapSize().addAndGet(s);
245     return s;
246   }
247 
248   /**
249    * Get cell length after serialized in {@link KeyValue}
250    */
251   int getCellLength(Cell cell) {
252     return KeyValueUtil.length(cell);
253   }
254 
255   private Cell maybeCloneWithAllocator(Cell cell) {
256     if (activeSection.getMemStoreLAB() == null) {
257       return cell;
258     }
259 
260     int len = getCellLength(cell);
261     ByteRange alloc = activeSection.getMemStoreLAB().allocateBytes(len);
262     if (alloc == null) {
263       // The allocation was too large, allocator decided
264       // not to do anything with it.
265       return cell;
266     }
267     assert alloc.getBytes() != null;
268     KeyValueUtil.appendToByteArray(cell, alloc.getBytes(), alloc.getOffset());
269     KeyValue newKv = new KeyValue(alloc.getBytes(), alloc.getOffset(), len);
270     newKv.setSequenceId(cell.getSequenceId());
271     return newKv;
272   }
273 
274   /**
275    * Remove n key from the memstore. Only cells that have the same key and the
276    * same memstoreTS are removed.  It is ok to not update timeRangeTracker
277    * in this call. It is possible that we can optimize this method by using
278    * tailMap/iterator, but since this method is called rarely (only for
279    * error recovery), we can leave those optimization for the future.
280    * @param cell
281    */
282   @Override
283   public void rollback(Cell cell) {
284     // If the key is in the snapshot, delete it. We should not update
285     // this.size, because that tracks the size of only the memstore and
286     // not the snapshot. The flush of this snapshot to disk has not
287     // yet started because Store.flush() waits for all rwcc transactions to
288     // commit before starting the flush to disk.
289     Cell found = snapshotSection.getCellSkipListSet().get(cell);
290     if (found != null && found.getSequenceId() == cell.getSequenceId()) {
291       snapshotSection.getCellSkipListSet().remove(cell);
292       long sz = heapSizeChange(cell, true);
293       snapshotSection.getHeapSize().addAndGet(-sz);
294       snapshotSection.getCellsCount().decrementAndGet();
295     }
296 
297     // If the key is in the memstore, delete it. Update this.size.
298     found = activeSection.getCellSkipListSet().get(cell);
299     if (found != null && found.getSequenceId() == cell.getSequenceId()) {
300       removeFromCellSet(found);
301       long sz = heapSizeChange(found, true);
302       activeSection.getHeapSize().addAndGet(-sz);
303       activeSection.getCellsCount().decrementAndGet();
304     }
305   }
306 
307   /**
308    * Write a delete
309    * @param deleteCell
310    * @return approximate size of the passed key and value.
311    */
312   @Override
313   public long delete(Cell deleteCell) {
314     Cell toAdd = maybeCloneWithAllocator(deleteCell);
315     boolean mslabUsed = (toAdd != deleteCell);
316     return internalAdd(toAdd, mslabUsed);
317   }
318 
319   /**
320    * @param cell Find the row that comes after this one.  If null, we return the
321    * first.
322    * @return Next row or null if none found.
323    */
324   Cell getNextRow(final Cell cell) {
325     return getLowest(getNextRow(cell, activeSection.getCellSkipListSet()),
326           getNextRow(cell, snapshotSection.getCellSkipListSet()));
327   }
328 
329   /*
330    * @param a
331    * @param b
332    * @return Return lowest of a or b or null if both a and b are null
333    */
334   private Cell getLowest(final Cell a, final Cell b) {
335     if (a == null) {
336       return b;
337     }
338     if (b == null) {
339       return a;
340     }
341     return comparator.compareRows(a, b) <= 0? a: b;
342   }
343 
344   /*
345    * @param key Find row that follows this one.  If null, return first.
346    * @param map Set to look in for a row beyond <code>row</code>.
347    * @return Next row or null if none found.  If one found, will be a new
348    * KeyValue -- can be destroyed by subsequent calls to this method.
349    */
350   private Cell getNextRow(final Cell key,
351       final NavigableSet<Cell> set) {
352     Cell result = null;
353     SortedSet<Cell> tail = key == null? set: set.tailSet(key);
354     // Iterate until we fall into the next row; i.e. move off current row
355     for (Cell cell: tail) {
356       if (comparator.compareRows(cell, key) <= 0)
357         continue;
358       // Note: Not suppressing deletes or expired cells.  Needs to be handled
359       // by higher up functions.
360       result = cell;
361       break;
362     }
363     return result;
364   }
365 
366   /**
367    * @param state column/delete tracking state
368    */
369   @Override
370   public void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
371     getRowKeyAtOrBefore(activeSection.getCellSkipListSet(), state);
372     getRowKeyAtOrBefore(snapshotSection.getCellSkipListSet(), state);
373   }
374 
375   /*
376    * @param set
377    * @param state Accumulates deletes and candidates.
378    */
379   private void getRowKeyAtOrBefore(final NavigableSet<Cell> set,
380       final GetClosestRowBeforeTracker state) {
381     if (set.isEmpty()) {
382       return;
383     }
384     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
385       // Found nothing in row.  Try backing up.
386       getRowKeyBefore(set, state);
387     }
388   }
389 
390   /*
391    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
392    * we have been passed the first possible key on a row.  As we walk forward
393    * we accumulate deletes until we hit a candidate on the row at which point
394    * we return.
395    * @param set
396    * @param firstOnRow First possible key on this row.
397    * @param state
398    * @return True if we found a candidate walking this row.
399    */
400   private boolean walkForwardInSingleRow(final SortedSet<Cell> set,
401       final Cell firstOnRow, final GetClosestRowBeforeTracker state) {
402     boolean foundCandidate = false;
403     SortedSet<Cell> tail = set.tailSet(firstOnRow);
404     if (tail.isEmpty()) return foundCandidate;
405     for (Iterator<Cell> i = tail.iterator(); i.hasNext();) {
406       Cell kv = i.next();
407       // Did we go beyond the target row? If so break.
408       if (state.isTooFar(kv, firstOnRow)) break;
409       if (state.isExpired(kv)) {
410         i.remove();
411         continue;
412       }
413       // If we added something, this row is a contender. break.
414       if (state.handle(kv)) {
415         foundCandidate = true;
416         break;
417       }
418     }
419     return foundCandidate;
420   }
421 
422   /*
423    * Walk backwards through the passed set a row at a time until we run out of
424    * set or until we get a candidate.
425    * @param set
426    * @param state
427    */
428   private void getRowKeyBefore(NavigableSet<Cell> set,
429       final GetClosestRowBeforeTracker state) {
430     Cell firstOnRow = state.getTargetKey();
431     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
432         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
433       // Make sure we don't fall out of our table.
434       if (!state.isTargetTable(p.cell)) break;
435       // Stop looking if we've exited the better candidate range.
436       if (!state.isBetterCandidate(p.cell)) break;
437       // Make into firstOnRow
438       firstOnRow = new KeyValue(p.cell.getRowArray(), p.cell.getRowOffset(), p.cell.getRowLength(),
439           HConstants.LATEST_TIMESTAMP);
440       // If we find something, break;
441       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
442     }
443   }
444 
445   /**
446    * Only used by tests. TODO: Remove
447    *
448    * Given the specs of a column, update it, first by inserting a new record,
449    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
450    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
451    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
452    * get the new value, or the old value and all readers will eventually only see the new
453    * value after the old was removed.
454    *
455    * @param row
456    * @param family
457    * @param qualifier
458    * @param newValue
459    * @param now
460    * @return  Timestamp
461    */
462   @Override
463   public long updateColumnValue(byte[] row,
464                                 byte[] family,
465                                 byte[] qualifier,
466                                 long newValue,
467                                 long now) {
468     Cell firstCell = KeyValueUtil.createFirstOnRow(row, family, qualifier);
469     // Is there a Cell in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
470     SortedSet<Cell> snSs = snapshotSection.getCellSkipListSet().tailSet(firstCell);
471     if (!snSs.isEmpty()) {
472       Cell snc = snSs.first();
473       // is there a matching Cell in the snapshot?
474       if (CellUtil.matchingRow(snc, firstCell) && CellUtil.matchingQualifier(snc, firstCell)) {
475         if (snc.getTimestamp() == now) {
476           // poop,
477           now += 1;
478         }
479       }
480     }
481 
482     // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
483     // But the timestamp should also be max(now, mostRecentTsInMemstore)
484 
485     // so we cant add the new Cell w/o knowing what's there already, but we also
486     // want to take this chance to delete some cells. So two loops (sad)
487 
488     SortedSet<Cell> ss = activeSection.getCellSkipListSet().tailSet(firstCell);
489     for (Cell cell : ss) {
490       // if this isnt the row we are interested in, then bail:
491       if (!CellUtil.matchingColumn(cell, family, qualifier)
492           || !CellUtil.matchingRow(cell, firstCell)) {
493         break; // rows dont match, bail.
494       }
495 
496       // if the qualifier matches and it's a put, just RM it out of the cellSet.
497       if (cell.getTypeByte() == KeyValue.Type.Put.getCode() &&
498           cell.getTimestamp() > now && CellUtil.matchingQualifier(firstCell, cell)) {
499         now = cell.getTimestamp();
500       }
501     }
502 
503     // create or update (upsert) a new Cell with
504     // 'now' and a 0 memstoreTS == immediately visible
505     List<Cell> cells = new ArrayList<Cell>(1);
506     cells.add(new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)));
507     return upsert(cells, 1L, null);
508   }
509 
510   /**
511    * Update or insert the specified KeyValues.
512    * <p>
513    * For each KeyValue, insert into MemStore.  This will atomically upsert the
514    * value for that row/family/qualifier.  If a KeyValue did already exist,
515    * it will then be removed.
516    * <p>
517    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
518    * be immediately visible.  May want to change this so it is atomic across
519    * all KeyValues.
520    * <p>
521    * This is called under row lock, so Get operations will still see updates
522    * atomically.  Scans will only see each KeyValue update as atomic.
523    *
524    * @param cells
525    * @param readpoint readpoint below which we can safely remove duplicate KVs
526    * @param removedCells collect the removed cells. It can be null.
527    * @return change in memstore size
528    */
529   @Override
530   public long upsert(Iterable<Cell> cells, long readpoint, List<Cell> removedCells) {
531     long size = 0;
532     for (Cell cell : cells) {
533       size += upsert(cell, readpoint, removedCells);
534     }
535     return size;
536   }
537 
538   /**
539    * Inserts the specified KeyValue into MemStore and deletes any existing
540    * versions of the same row/family/qualifier as the specified KeyValue.
541    * <p>
542    * First, the specified KeyValue is inserted into the Memstore.
543    * <p>
544    * If there are any existing KeyValues in this MemStore with the same row,
545    * family, and qualifier, they are removed.
546    * <p>
547    * Callers must hold the read lock.
548    *
549    * @param cell
550    * @return change in size of MemStore
551    */
552   private long upsert(Cell cell, long readpoint, List<Cell> removedCells) {
553     // Add the Cell to the MemStore
554     // Use the internalAdd method here since we (a) already have a lock
555     // and (b) cannot safely use the MSLAB here without potentially
556     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
557     // test that triggers the pathological case if we don't avoid MSLAB
558     // here.
559     long addedSize = internalAdd(cell, false);
560 
561     // Get the Cells for the row/family/qualifier regardless of timestamp.
562     // For this case we want to clean up any other puts
563     Cell firstCell = KeyValueUtil.createFirstOnRow(
564         cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
565         cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
566         cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
567     SortedSet<Cell> ss = activeSection.getCellSkipListSet().tailSet(firstCell);
568     Iterator<Cell> it = ss.iterator();
569     // versions visible to oldest scanner
570     int versionsVisible = 0;
571     while ( it.hasNext() ) {
572       Cell cur = it.next();
573 
574       if (cell == cur) {
575         // ignore the one just put in
576         continue;
577       }
578       // check that this is the row and column we are interested in, otherwise bail
579       if (CellUtil.matchingRow(cell, cur) && CellUtil.matchingQualifier(cell, cur)) {
580         // only remove Puts that concurrent scanners cannot possibly see
581         if (cur.getTypeByte() == KeyValue.Type.Put.getCode() &&
582             cur.getSequenceId() <= readpoint) {
583           if (versionsVisible >= 1) {
584             // if we get here we have seen at least one version visible to the oldest scanner,
585             // which means we can prove that no scanner will see this version
586 
587             // false means there was a change, so give us the size.
588             long delta = heapSizeChange(cur, true);
589             addedSize -= delta;
590             activeSection.getHeapSize().addAndGet(-delta);
591             activeSection.getCellsCount().decrementAndGet();
592             if (removedCells != null) {
593               removedCells.add(cur);
594             }
595             it.remove();
596             setOldestEditTimeToNow();
597           } else {
598             versionsVisible++;
599           }
600         }
601       } else {
602         // past the row or column, done
603         break;
604       }
605     }
606     return addedSize;
607   }
608 
609   /*
610    * Immutable data structure to hold member found in set and the set it was
611    * found in. Include set because it is carrying context.
612    */
613   private static class Member {
614     final Cell cell;
615     final NavigableSet<Cell> set;
616     Member(final NavigableSet<Cell> s, final Cell kv) {
617       this.cell = kv;
618       this.set = s;
619     }
620   }
621 
622   /*
623    * @param set Set to walk back in.  Pass a first in row or we'll return
624    * same row (loop).
625    * @param state Utility and context.
626    * @param firstOnRow First item on the row after the one we want to find a
627    * member in.
628    * @return Null or member of row previous to <code>firstOnRow</code>
629    */
630   private Member memberOfPreviousRow(NavigableSet<Cell> set,
631       final GetClosestRowBeforeTracker state, final Cell firstOnRow) {
632     NavigableSet<Cell> head = set.headSet(firstOnRow, false);
633     if (head.isEmpty()) return null;
634     for (Iterator<Cell> i = head.descendingIterator(); i.hasNext();) {
635       Cell found = i.next();
636       if (state.isExpired(found)) {
637         i.remove();
638         continue;
639       }
640       return new Member(head, found);
641     }
642     return null;
643   }
644 
645   /**
646    * @return scanner on memstore and snapshot in this order.
647    */
648   @Override
649   public List<KeyValueScanner> getScanners(long readPt) {
650     MemStoreScanner scanner =
651       new MemStoreScanner(activeSection, snapshotSection, readPt, comparator);
652     scanner.seek(CellUtil.createCell(HConstants.EMPTY_START_ROW));
653     if (scanner.peek() == null) {
654       scanner.close();
655       return null;
656     }
657     return Collections.<KeyValueScanner> singletonList(scanner);
658   }
659 
660   /**
661    * Check if this memstore may contain the required keys
662    * @param scan scan
663    * @param store holds reference to cf
664    * @param oldestUnexpiredTS
665    * @return False if the key definitely does not exist in this Memstore
666    */
667   public boolean shouldSeek(Scan scan, Store store, long oldestUnexpiredTS) {
668     return shouldSeek(activeSection.getTimeRangeTracker(),
669         snapshotSection.getTimeRangeTracker(), scan, store, oldestUnexpiredTS);
670   }
671 
672   /**
673    * Check if this memstore may contain the required keys
674    * @param activeTimeRangeTracker the tracker of active data
675    * @param snapshotTimeRangeTracker the tracker of snapshot data
676    * @param scan scan
677    * @param store holds reference to cf
678    * @param oldestUnexpiredTS
679    * @return False if the key definitely does not exist in this Memstore
680    */
681   private static boolean shouldSeek(TimeRangeTracker activeTimeRangeTracker,
682       TimeRangeTracker snapshotTimeRangeTracker, Scan scan, Store store, long oldestUnexpiredTS) {
683     byte[] cf = store.getFamily().getName();
684     TimeRange timeRange = scan.getColumnFamilyTimeRange().get(cf);
685     if (timeRange == null) {
686       timeRange = scan.getTimeRange();
687     }
688     return (activeTimeRangeTracker.includesTimeRange(timeRange) ||
689       snapshotTimeRangeTracker.includesTimeRange(timeRange)) &&
690       (Math.max(activeTimeRangeTracker.getMax(), snapshotTimeRangeTracker.getMax()) >= oldestUnexpiredTS);
691   }
692 
693   /*
694    * MemStoreScanner implements the KeyValueScanner.
695    * It lets the caller scan the contents of a memstore -- both current
696    * map and snapshot.
697    * This behaves as if it were a real scanner but does not maintain position.
698    */
699   protected static class MemStoreScanner extends NonLazyKeyValueScanner {
700     // Next row information for either cellSet or snapshot
701     private Cell cellSetNextRow = null;
702     private Cell snapshotNextRow = null;
703 
704     // last iterated Cells for cellSet and snapshot (to restore iterator state after reseek)
705     private Cell cellSetItRow = null;
706     private Cell snapshotItRow = null;
707     
708     // iterator based scanning.
709     private Iterator<Cell> cellSetIt;
710     private Iterator<Cell> snapshotIt;
711 
712     // The cellSet and snapshot at the time of creating this scanner
713     private final Section activeAtCreation;
714     private final Section snapshotAtCreation;
715 
716     // the pre-calculated Cell to be returned by peek() or next()
717     private Cell theNext;
718 
719     // A flag represents whether could stop skipping Cells for MVCC
720     // if have encountered the next row. Only used for reversed scan
721     private boolean stopSkippingCellsIfNextRow = false;
722     // Stop skipping KeyValues for MVCC if finish this row. Only used for reversed scan
723     private Cell stopSkippingKVsRow;
724 
725     private final long readPoint;
726     private final KeyValue.KVComparator comparator;
    /*
    Some notes...

    A MemStoreScanner is fixed at creation time; this includes its pointers/iterators
    into the existing kvset/snapshot.  During snapshot creation, the kvset is null and
    the snapshot is moved.  Since the kvset is null there is no point in reseeking on
    both, so we can save ourselves the trouble.  During the snapshot->hfile transition,
    the memstore scanner is re-created by StoreScanner#updateReaders().  StoreScanner
    should potentially do something smarter by adjusting the existing memstore scanner.

    But there is a greater problem here: once a scanner has progressed during a
    snapshot scenario, we currently iterate past the kvset and then 'finish' up.
    If a scan lasts a little while, there is a chance for new entries in the kvset to
    become available, but we will never see them.  This needs to be handled at the
    StoreScanner level, in coordination with MemStoreScanner.

    Currently, this problem is only partly managed: during the small amount of time
    when the StoreScanner has not yet created a new MemStoreScanner, we will miss
    the adds to the kvset in the MemStoreScanner.
    */
747 
748     MemStoreScanner(Section activeSection, Section snapshotSection, long readPoint, final KeyValue.KVComparator c) {
749       this.readPoint = readPoint;
750       this.comparator = c;
751       activeAtCreation = activeSection;
752       snapshotAtCreation = snapshotSection;
753       if (activeAtCreation.getMemStoreLAB() != null) {
754         activeAtCreation.getMemStoreLAB().incScannerCount();
755       }
756       if (snapshotAtCreation.getMemStoreLAB() != null) {
757         snapshotAtCreation.getMemStoreLAB().incScannerCount();
758       }
759       if (Trace.isTracing() && Trace.currentSpan() != null) {
760         Trace.currentSpan().addTimelineAnnotation("Creating MemStoreScanner");
761       }
762     }
763 
764     /**
765      * Lock on 'this' must be held by caller.
766      * @param it
767      * @return Next Cell
768      */
769     private Cell getNext(Iterator<Cell> it) {
770       Cell v = null;
771       try {
772         while (it.hasNext()) {
773           v = it.next();
774           if (v.getSequenceId() <= this.readPoint) {
775             return v;
776           }
777           if (stopSkippingCellsIfNextRow && stopSkippingKVsRow != null
778               && comparator.compareRows(v, stopSkippingKVsRow) > 0) {
779             return null;
780           }
781         }
782 
783         return null;
784       } finally {
785         if (v != null) {
786           // in all cases, remember the last Cell iterated to
787           if (it == snapshotIt) {
788             snapshotItRow = v;
789           } else {
790             cellSetItRow = v;
791           }
792         }
793       }
794     }
795 
796     /**
797      *  Set the scanner at the seek key.
798      *  Must be called only once: there is no thread safety between the scanner
799      *   and the memStore.
800      * @param key seek value
801      * @return false if the key is null or if there is no data
802      */
803     @Override
804     public synchronized boolean seek(Cell key) {
805       if (key == null) {
806         close();
807         return false;
808       }
809       // kvset and snapshot will never be null.
810       // if tailSet can't find anything, SortedSet is empty (not null).
811       cellSetIt = activeAtCreation.getCellSkipListSet().tailSet(key).iterator();
812       snapshotIt = snapshotAtCreation.getCellSkipListSet().tailSet(key).iterator();
813       cellSetItRow = null;
814       snapshotItRow = null;
815 
816       return seekInSubLists(key);
817     }
818 
819 
820     /**
821      * (Re)initialize the iterators after a seek or a reseek.
822      */
823     private synchronized boolean seekInSubLists(Cell key){
824       cellSetNextRow = getNext(cellSetIt);
825       snapshotNextRow = getNext(snapshotIt);
826 
827       // Calculate the next value
828       theNext = getLowest(cellSetNextRow, snapshotNextRow);
829 
830       // has data
831       return (theNext != null);
832     }
833 
834 
835     /**
836      * Move forward on the sub-lists set previously by seek.
837      * @param key seek value (should be non-null)
838      * @return true if there is at least one KV to read, false otherwise
839      */
840     @Override
841     public synchronized boolean reseek(Cell key) {
842       /*
843       See HBASE-4195 & HBASE-3855 & HBASE-6591 for the background on this implementation.
844       This code is executed concurrently with flush and puts, without locks.
845       Two points must be known when working on this code:
846       1) It's not possible to use the 'kvTail' and 'snapshot'
847        variables, as they are modified during a flush.
848       2) The ideal implementation for performance would use the sub skip list
849        implicitly pointed by the iterators 'kvsetIt' and
850        'snapshotIt'. Unfortunately the Java API does not offer a method to
851        get it. So we remember the last keys we iterated to and restore
852        the reseeked set to at least that point.
853        */
854       cellSetIt = activeAtCreation.getCellSkipListSet().tailSet(getHighest(key, cellSetItRow)).iterator();
855       snapshotIt = snapshotAtCreation.getCellSkipListSet().tailSet(getHighest(key, snapshotItRow)).iterator();
856 
857       return seekInSubLists(key);
858     }
859 
860 
861     @Override
862     public synchronized Cell peek() {
863       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
864       return theNext;
865     }
866 
867     @Override
868     public synchronized Cell next() {
869       if (theNext == null) {
870           return null;
871       }
872 
873       final Cell ret = theNext;
874 
875       // Advance one of the iterators
876       if (theNext == cellSetNextRow) {
877         cellSetNextRow = getNext(cellSetIt);
878       } else {
879         snapshotNextRow = getNext(snapshotIt);
880       }
881 
882       // Calculate the next value
883       theNext = getLowest(cellSetNextRow, snapshotNextRow);
884 
885       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
886       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
887       //    getLowest() + " threadpoint=" + readpoint);
888       return ret;
889     }
890 
891     /*
892      * Returns the lower of the two key values, or null if they are both null.
893      * This uses comparator.compare() to compare the KeyValue using the memstore
894      * comparator.
895      */
896     private Cell getLowest(Cell first, Cell second) {
897       if (first == null && second == null) {
898         return null;
899       }
900       if (first != null && second != null) {
901         int compare = comparator.compare(first, second);
902         return (compare <= 0 ? first : second);
903       }
904       return (first != null ? first : second);
905     }
906 
907     /*
908      * Returns the higher of the two cells, or null if they are both null.
909      * This uses comparator.compare() to compare the Cell using the memstore
910      * comparator.
911      */
912     private Cell getHighest(Cell first, Cell second) {
913       if (first == null && second == null) {
914         return null;
915       }
916       if (first != null && second != null) {
917         int compare = comparator.compare(first, second);
918         return (compare > 0 ? first : second);
919       }
920       return (first != null ? first : second);
921     }
922 
923     @Override
924     public synchronized void close() {
925       this.cellSetNextRow = null;
926       this.snapshotNextRow = null;
927 
928       this.cellSetIt = null;
929       this.snapshotIt = null;
930 
931       if (activeAtCreation != null && activeAtCreation.getMemStoreLAB() != null) {
932         activeAtCreation.getMemStoreLAB().decScannerCount();
933       }
934       if (snapshotAtCreation != null && snapshotAtCreation.getMemStoreLAB() != null) {
935         snapshotAtCreation.getMemStoreLAB().decScannerCount();
936       }
937 
938       this.cellSetItRow = null;
939       this.snapshotItRow = null;
940     }
941 
942     /**
943      * MemStoreScanner returns Long.MAX_VALUE because it will always have the latest data among all
944      * scanners.
945      * @see KeyValueScanner#getScannerOrder()
946      */
947     @Override
948     public long getScannerOrder() {
949       return Long.MAX_VALUE;
950     }
951 
952     @Override
953     public boolean shouldUseScanner(Scan scan, Store store, long oldestUnexpiredTS) {
954       return shouldSeek(activeAtCreation.getTimeRangeTracker(),
955         snapshotAtCreation.getTimeRangeTracker(), scan, store, oldestUnexpiredTS);
956     }
957 
958     /**
959      * Seek scanner to the given key first. If it returns false(means
960      * peek()==null) or scanner's peek row is bigger than row of given key, seek
961      * the scanner to the previous row of given key
962      */
963     @Override
964     public synchronized boolean backwardSeek(Cell key) {
965       seek(key);
966       if (peek() == null || comparator.compareRows(peek(), key) > 0) {
967         return seekToPreviousRow(key);
968       }
969       return true;
970     }
971 
972     /**
973      * Separately get the KeyValue before the specified key from kvset and
974      * snapshotset, and use the row of higher one as the previous row of
975      * specified key, then seek to the first KeyValue of previous row
976      */
977     @Override
978     public synchronized boolean seekToPreviousRow(Cell originalKey) {
979       boolean keepSeeking = false;
980       Cell key = originalKey;
981       do {
982         Cell firstKeyOnRow = KeyValueUtil.createFirstOnRow(key.getRowArray(), key.getRowOffset(),
983             key.getRowLength());
984         SortedSet<Cell> cellHead = activeAtCreation.getCellSkipListSet().headSet(firstKeyOnRow);
985         Cell cellSetBeforeRow = cellHead.isEmpty() ? null : cellHead.last();
986         SortedSet<Cell> snapshotHead = snapshotAtCreation.getCellSkipListSet()
987             .headSet(firstKeyOnRow);
988         Cell snapshotBeforeRow = snapshotHead.isEmpty() ? null : snapshotHead
989             .last();
990         Cell lastCellBeforeRow = getHighest(cellSetBeforeRow, snapshotBeforeRow);
991         if (lastCellBeforeRow == null) {
992           theNext = null;
993           return false;
994         }
995         Cell firstKeyOnPreviousRow = KeyValueUtil.createFirstOnRow(lastCellBeforeRow.getRowArray(),
996             lastCellBeforeRow.getRowOffset(), lastCellBeforeRow.getRowLength());
997         this.stopSkippingCellsIfNextRow = true;
998         this.stopSkippingKVsRow = firstKeyOnPreviousRow;
999         seek(firstKeyOnPreviousRow);
1000         this.stopSkippingCellsIfNextRow = false;
1001         if (peek() == null
1002             || comparator.compareRows(peek(), firstKeyOnPreviousRow) > 0) {
1003           keepSeeking = true;
1004           key = firstKeyOnPreviousRow;
1005           continue;
1006         } else {
1007           keepSeeking = false;
1008         }
1009       } while (keepSeeking);
1010       return true;
1011     }
1012 
1013     @Override
1014     public synchronized boolean seekToLastRow() {
1015       Cell first = activeAtCreation.getCellSkipListSet().isEmpty() ? null
1016           : activeAtCreation.getCellSkipListSet().last();
1017       Cell second = snapshotAtCreation.getCellSkipListSet().isEmpty() ? null
1018           : snapshotAtCreation.getCellSkipListSet().last();
1019       Cell higherCell = getHighest(first, second);
1020       if (higherCell == null) {
1021         return false;
1022       }
1023       Cell firstCellOnLastRow = KeyValueUtil.createFirstOnRow(higherCell.getRowArray(),
1024           higherCell.getRowOffset(), higherCell.getRowLength());
1025       if (seek(firstCellOnLastRow)) {
1026         return true;
1027       } else {
1028         return seekToPreviousRow(higherCell);
1029       }
1030 
1031     }
1032   }
1033 
1034   public final static long FIXED_OVERHEAD = ClassSize.align(ClassSize.OBJECT
1035       + (4 * ClassSize.REFERENCE) + (2 * Bytes.SIZEOF_LONG) + Bytes.SIZEOF_BOOLEAN);
1036 
1037   public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
1038       (2 * ClassSize.ATOMIC_LONG) + (2 * ClassSize.TIMERANGE_TRACKER) +
1039       (2 * ClassSize.CELL_SKIPLIST_SET) + (2 * ClassSize.CONCURRENT_SKIPLISTMAP) +
1040       ClassSize.ATOMIC_INTEGER);
1041 
1042   /*
1043    * Calculate how the MemStore size has changed.  Includes overhead of the
1044    * backing Map.
1045    * @param cell
1046    * @param notpresent True if the cell was NOT present in the set.
1047    * @return Size
1048    */
1049   static long heapSizeChange(final Cell cell, final boolean notpresent) {
1050     return notpresent ? ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY
1051         + CellUtil.estimatedHeapSizeOf(cell)) : 0;
1052   }
1053 
1054   private long keySize() {
1055     return heapSize() - DEEP_OVERHEAD;
1056   }
1057 
1058   /**
1059    * Get the entire heap usage for this MemStore not including keys in the
1060    * snapshot.
1061    */
1062   @Override
1063   public long heapSize() {
1064     return activeSection.getHeapSize().get();
1065   }
1066 
1067   @Override
1068   public long size() {
1069     return heapSize();
1070   }
1071 
1072   /**
1073    * Code to help figure if our approximation of object heap sizes is close
1074    * enough.  See hbase-900.  Fills memstores then waits so user can heap
1075    * dump and bring up resultant hprof in something like jprofiler which
1076    * allows you get 'deep size' on objects.
1077    * @param args main args
1078    */
1079   public static void main(String [] args) {
1080     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
1081     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
1082       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
1083     LOG.info("vmInputArguments=" + runtime.getInputArguments());
1084     DefaultMemStore memstore1 = new DefaultMemStore();
1085     // TODO: x32 vs x64
1086     long size = 0;
1087     final int count = 10000;
1088     byte [] fam = Bytes.toBytes("col");
1089     byte [] qf = Bytes.toBytes("umn");
1090     byte [] empty = new byte[0];
1091     for (int i = 0; i < count; i++) {
1092       // Give each its own ts
1093       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1094     }
1095     LOG.info("memstore1 estimated size=" + size);
1096     for (int i = 0; i < count; i++) {
1097       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1098     }
1099     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
1100     // Make a variably sized memstore.
1101     DefaultMemStore memstore2 = new DefaultMemStore();
1102     for (int i = 0; i < count; i++) {
1103       size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, new byte[i]));
1104     }
1105     LOG.info("memstore2 estimated size=" + size);
1106     final int seconds = 30;
1107     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
1108     for (int i = 0; i < seconds; i++) {
1109       // Thread.sleep(1000);
1110     }
1111     LOG.info("Exiting.");
1112   }
1113 
1114   /**
1115    * Contains the fields which are useful to MemStoreScanner.
1116    */
1117   @InterfaceAudience.Private
1118   static class Section {
1119     /**
1120      * MemStore.  Use a CellSkipListSet rather than SkipListSet because of the
1121      * better semantics.  The Map will overwrite if passed a key it already had
1122      * whereas the Set will not add new Cell if key is same though value might be
1123      * different.  Value is not important -- just make sure always same reference passed.
1124      */
1125     private final CellSkipListSet cellSet;
1126     private final TimeRangeTracker tracker = new TimeRangeTracker();
1127     /**
1128      * Used to track own heapSize.
1129      */
1130     private final AtomicLong heapSize;
1131     private final AtomicInteger cellCount;
1132     private final MemStoreLAB allocator;
1133 
1134     static Section newSnapshotSection(final KeyValue.KVComparator c) {
1135       return new Section(c, null, 0);
1136     }
1137 
1138     static Section newActiveSection(final KeyValue.KVComparator c,
1139             final Configuration conf) {
1140       return new Section(c, conf, DEEP_OVERHEAD);
1141     }
1142 
1143     private Section(final KeyValue.KVComparator c,
1144             final Configuration conf, long initHeapSize) {
1145       this.cellSet = new CellSkipListSet(c);
1146       this.heapSize = new AtomicLong(initHeapSize);
1147       this.cellCount = new AtomicInteger(0);
1148       if (conf != null && conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
1149         String className = conf.get(MSLAB_CLASS_NAME, HeapMemStoreLAB.class.getName());
1150         this.allocator = ReflectionUtils.instantiateWithCustomCtor(className,
1151                 new Class[]{Configuration.class}, new Object[]{conf});
1152       } else {
1153         this.allocator = null;
1154       }
1155     }
1156 
1157     CellSkipListSet getCellSkipListSet() {
1158       return cellSet;
1159     }
1160 
1161     TimeRangeTracker getTimeRangeTracker() {
1162       return tracker;
1163     }
1164 
1165     AtomicLong getHeapSize() {
1166       return heapSize;
1167     }
1168 
1169     AtomicInteger getCellsCount() {
1170       return cellCount;
1171     }
1172 
1173     MemStoreLAB getMemStoreLAB() {
1174       return allocator;
1175     }
1176   }
1177 }