1 /*
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19 package org.apache.hadoop.hbase.io.hfile;
20
21 import java.io.ByteArrayOutputStream;
22 import java.io.DataInput;
23 import java.io.DataInputStream;
24 import java.io.DataOutput;
25 import java.io.DataOutputStream;
26 import java.io.IOException;
27 import java.nio.ByteBuffer;
28 import java.util.ArrayList;
29 import java.util.Collections;
30 import java.util.List;
31 import java.util.concurrent.atomic.AtomicReference;
32
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35 import org.apache.hadoop.conf.Configuration;
36 import org.apache.hadoop.fs.FSDataOutputStream;
37 import org.apache.hadoop.hbase.Cell;
38 import org.apache.hadoop.hbase.KeyValue;
39 import org.apache.hadoop.hbase.KeyValue.KVComparator;
40 import org.apache.hadoop.hbase.KeyValueUtil;
41 import org.apache.hadoop.hbase.classification.InterfaceAudience;
42 import org.apache.hadoop.hbase.io.HeapSize;
43 import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
44 import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader;
45 import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
46 import org.apache.hadoop.hbase.util.ByteBufferUtils;
47 import org.apache.hadoop.hbase.util.Bytes;
48 import org.apache.hadoop.hbase.util.ClassSize;
49 import org.apache.hadoop.io.WritableUtils;
50 import org.apache.hadoop.util.StringUtils;
51
52 /**
 * Provides functionality to write ({@link BlockIndexWriter}) and read
 * ({@link BlockIndexReader}) single-level and multi-level block indexes.
56 *
57 * Examples of how to use the block index writer can be found in
58 * {@link org.apache.hadoop.hbase.util.CompoundBloomFilterWriter} and
59 * {@link HFileWriterV2}. Examples of how to use the reader can be
60 * found in {@link HFileReaderV2} and
61 * {@link org.apache.hadoop.hbase.io.hfile.TestHFileBlockIndex}.
62 */
63 @InterfaceAudience.Private
64 public class HFileBlockIndex {
65
/** Logger for this class. */
private static final Log LOG = LogFactory.getLog(HFileBlockIndex.class);

/** Default maximum index block size (128 KB), used when {@link #MAX_CHUNK_SIZE_KEY} is not set. */
static final int DEFAULT_MAX_CHUNK_SIZE = 128 * 1024;

/**
 * The maximum size guideline for index blocks (leaf, intermediate, and
 * root). If not specified, <code>DEFAULT_MAX_CHUNK_SIZE</code> is used.
 */
public static final String MAX_CHUNK_SIZE_KEY = "hfile.index.block.max.size";

/**
 * Minimum number of entries in a single index block. Even if we are above the
 * hfile.index.block.max.size we will keep writing to the same block unless we have that many
 * entries. We should have at least a few entries so that we don't have too many levels in the
 * multi-level index. This should be at least 2 to make sure there is no infinite recursion.
 */
public static final String MIN_INDEX_NUM_ENTRIES_KEY = "hfile.index.block.min.entries";

/** Default minimum number of entries per index block that a {@link BlockIndexWriter} starts with. */
static final int DEFAULT_MIN_INDEX_NUM_ENTRIES = 16;

/**
 * The number of bytes stored in each "secondary index" entry in addition to
 * key bytes in the non-root index block format. The first long is the file
 * offset of the deeper-level block the entry points to, and the int that
 * follows is that block's on-disk size without including header.
 */
static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT
    + Bytes.SIZEOF_LONG;

/**
 * Error message when trying to use inline block API in single-level mode.
 */
private static final String INLINE_BLOCKS_NOT_ALLOWED =
    "Inline blocks are not allowed in the single-level-only mode";

/**
 * The size of a meta-data record used for finding the mid-key in a
 * multi-level index. Consists of the middle leaf-level index block offset
 * (long), its on-disk size without header included (int), and the mid-key
 * entry's zero-based index in that leaf index block (int).
 */
private static final int MID_KEY_METADATA_SIZE = Bytes.SIZEOF_LONG +
    2 * Bytes.SIZEOF_INT;
109
110 /**
111 * The reader will always hold the root level index in the memory. Index
112 * blocks at all other levels will be cached in the LRU cache in practice,
113 * although this API does not enforce that.
114 *
115 * <p>All non-root (leaf and intermediate) index blocks contain what we call a
116 * "secondary index": an array of offsets to the entries within the block.
117 * This allows us to do binary search for the entry corresponding to the
118 * given key without having to deserialize the block.
119 */
120 public static class BlockIndexReader implements HeapSize {
/** Comparator used to order keys when doing lookups on index blocks. */
private final KVComparator comparator;

// Root-level data. The three arrays are parallel (entry i of each describes
// the same root-level entry) and are allocated together by readRootIndex();
// rootCount is the number of entries that have been filled in so far.
private byte[][] blockKeys;
private long[] blockOffsets;
private int[] blockDataSizes;
private int rootCount = 0;

// Mid-key metadata. These stay at -1 unless readMultiLevelIndexRoot() finds
// a mid-key metadata record after the root index; midkey() reads the middle
// leaf block only when midLeafBlockOffset is non-negative.
private long midLeafBlockOffset = -1;
private int midLeafBlockOnDiskSize = -1;
private int midKeyEntry = -1;

/** Mid-key cached by {@link #midkey()}; holds null until the first successful computation. */
private AtomicReference<byte[]> midKey = new AtomicReference<byte[]>();

/**
 * The number of levels in the block index tree. One if there is only root
 * level, two for root and leaf levels, etc.
 */
private int searchTreeLevel;

/**
 * A way to read {@link HFile} blocks at a given offset. Left null when the
 * two-argument constructor is used.
 */
private CachingBlockReader cachingBlockReader;
146
/**
 * Creates a reader that can additionally load blocks below the root level
 * through the supplied block reader.
 *
 * @param c comparator used for key lookups
 * @param treeLevel the number of levels in the block index tree
 * @param cachingBlockReader used to read blocks at a given file offset
 */
public BlockIndexReader(final KVComparator c, final int treeLevel,
    final CachingBlockReader cachingBlockReader) {
  // Delegate the common state to the two-argument constructor first.
  this(c, treeLevel);
  this.cachingBlockReader = cachingBlockReader;
}
152
/**
 * Creates a reader without a block reader attached.
 *
 * @param c comparator used for key lookups
 * @param treeLevel the number of levels in the block index tree
 */
public BlockIndexReader(final KVComparator c, final int treeLevel) {
  this.comparator = c;
  this.searchTreeLevel = treeLevel;
}
158
159 /**
160 * @return true if the block index is empty.
161 */
162 public boolean isEmpty() {
163 return blockKeys.length == 0;
164 }
165
166 /**
167 * Verifies that the block index is non-empty and throws an
168 * {@link IllegalStateException} otherwise.
169 */
170 public void ensureNonEmpty() {
171 if (blockKeys.length == 0) {
172 throw new IllegalStateException("Block index is empty or not loaded");
173 }
174 }
175
176 /**
177 * Return the data block which contains this key. This function will only
178 * be called when the HFile version is larger than 1.
179 *
180 * @param key the key we are looking for
181 * @param currentBlock the current block, to avoid re-reading the same block
182 * @param cacheBlocks
183 * @param pread
184 * @param isCompaction
185 * @param expectedDataBlockEncoding the data block encoding the caller is
186 * expecting the data block to be in, or null to not perform this
187 * check and return the block irrespective of the encoding
188 * @return reader a basic way to load blocks
189 * @throws IOException
190 */
191 public HFileBlock seekToDataBlock(final Cell key, HFileBlock currentBlock, boolean cacheBlocks,
192 boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding)
193 throws IOException {
194 BlockWithScanInfo blockWithScanInfo = loadDataBlockWithScanInfo(key, currentBlock,
195 cacheBlocks,
196 pread, isCompaction, expectedDataBlockEncoding);
197 if (blockWithScanInfo == null) {
198 return null;
199 } else {
200 return blockWithScanInfo.getHFileBlock();
201 }
202 }
203
/**
 * Return the BlockWithScanInfo, a data structure which contains the Data HFileBlock with
 * other scan info such as the key that starts the next HFileBlock. This function will only
 * be called when the HFile version is larger than 1.
 *
 * <p>The lookup starts at the in-memory root index and then walks down one
 * block at a time until a data block is reached, tracking the next indexed
 * key along the way.
 *
 * @param key the key we are looking for
 * @param currentBlock the current block, to avoid re-reading the same block
 * @param cacheBlocks whether the data-level block read should be cached;
 *          index-level reads are always requested with caching on
 * @param pread passed through unchanged to the block reader
 * @param isCompaction passed through unchanged to the block reader
 * @param expectedDataBlockEncoding the data block encoding the caller is
 *          expecting the data block to be in, or null to not perform this
 *          check and return the block irrespective of the encoding.
 * @return the BlockWithScanInfo which contains the DataBlock with other
 *         scan info such as nextIndexedKey, or null if the root index has
 *         no entry for the key.
 * @throws IOException if a block cannot be read, the key sorts before the
 *           first key of a non-root index block, or the levels encountered
 *           do not match the expected tree depth
 */
public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
    boolean cacheBlocks,
    boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding)
    throws IOException {
  int rootLevelIndex = rootBlockContainingKey(key);
  // -1 means the key sorts before the first root-level key.
  if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
    return null;
  }

  // the next indexed key
  Cell nextIndexedKey = null;

  // Read the next-level (intermediate or leaf) index block.
  long currentOffset = blockOffsets[rootLevelIndex];
  int currentOnDiskSize = blockDataSizes[rootLevelIndex];

  // Start with the root-level successor; deeper levels may refine it below.
  if (rootLevelIndex < blockKeys.length - 1) {
    nextIndexedKey = new KeyValue.KeyOnlyKeyValue(blockKeys[rootLevelIndex + 1]);
  } else {
    nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY;
  }

  int lookupLevel = 1; // How many levels deep we are in our lookup.
  int index = -1;

  HFileBlock block;
  while (true) {

    if (currentBlock != null && currentBlock.getOffset() == currentOffset)
    {
      // Avoid reading the same block again, even with caching turned off.
      // This is crucial for compaction-type workload which might have
      // caching turned off. This is like a one-block cache inside the
      // scanner.
      block = currentBlock;
    } else {
      // Call HFile's caching block reader API. We always cache index
      // blocks, otherwise we might get terrible performance.
      boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel);
      // The expected type follows from how far above the data level we are.
      BlockType expectedBlockType;
      if (lookupLevel < searchTreeLevel - 1) {
        expectedBlockType = BlockType.INTERMEDIATE_INDEX;
      } else if (lookupLevel == searchTreeLevel - 1) {
        expectedBlockType = BlockType.LEAF_INDEX;
      } else {
        // this also accounts for ENCODED_DATA
        expectedBlockType = BlockType.DATA;
      }
      block = cachingBlockReader.readBlock(currentOffset,
          currentOnDiskSize, shouldCache, pread, isCompaction, true,
          expectedBlockType, expectedDataBlockEncoding);
    }

    if (block == null) {
      throw new IOException("Failed to read block at offset " +
          currentOffset + ", onDiskSize=" + currentOnDiskSize);
    }

    // Found a data block, break the loop and check our level in the tree.
    if (block.getBlockType().isData()) {
      break;
    }

    // Not a data block. This must be a leaf-level or intermediate-level
    // index block. We don't allow going deeper than searchTreeLevel.
    if (++lookupLevel > searchTreeLevel) {
      throw new IOException("Search Tree Level overflow: lookupLevel="+
          lookupLevel + ", searchTreeLevel=" + searchTreeLevel);
    }

    // Locate the entry corresponding to the given key in the non-root
    // (leaf or intermediate-level) index block.
    ByteBuffer buffer = block.getBufferWithoutHeader();
    index = locateNonRootIndexEntry(buffer, key, comparator);
    if (index == -1) {
      // This has to be changed
      // For now change this to key value
      KeyValue kv = KeyValueUtil.ensureKeyValue(key);
      throw new IOException("The key "
          + Bytes.toStringBinary(kv.getKey(), kv.getKeyOffset(), kv.getKeyLength())
          + " is before the" + " first key of the non-root index block "
          + block);
    }

    // locateNonRootIndexEntry left the buffer positioned at the chosen
    // entry, so these relative reads return that entry's offset and size.
    currentOffset = buffer.getLong();
    currentOnDiskSize = buffer.getInt();

    // Only update next indexed key if there is a next indexed key in the current level
    byte[] tmpNextIndexedKey = getNonRootIndexedKey(buffer, index + 1);
    if (tmpNextIndexedKey != null) {
      nextIndexedKey = new KeyValue.KeyOnlyKeyValue(tmpNextIndexedKey);
    }
  }

  // A data block must be found exactly at the deepest level of the tree.
  if (lookupLevel != searchTreeLevel) {
    throw new IOException("Reached a data block at level " + lookupLevel +
        " but the number of levels is " + searchTreeLevel);
  }

  // set the next indexed key for the current block.
  BlockWithScanInfo blockWithScanInfo = new BlockWithScanInfo(block, nextIndexedKey);
  return blockWithScanInfo;
}
324
325 /**
326 * An approximation to the {@link HFile}'s mid-key. Operates on block
327 * boundaries, and does not go inside blocks. In other words, returns the
328 * first key of the middle block of the file.
329 *
330 * @return the first key of the middle block
331 */
332 public byte[] midkey() throws IOException {
333 if (rootCount == 0)
334 throw new IOException("HFile empty");
335
336 byte[] targetMidKey = this.midKey.get();
337 if (targetMidKey != null) {
338 return targetMidKey;
339 }
340
341 if (midLeafBlockOffset >= 0) {
342 if (cachingBlockReader == null) {
343 throw new IOException("Have to read the middle leaf block but " +
344 "no block reader available");
345 }
346
347 // Caching, using pread, assuming this is not a compaction.
348 HFileBlock midLeafBlock = cachingBlockReader.readBlock(
349 midLeafBlockOffset, midLeafBlockOnDiskSize, true, true, false, true,
350 BlockType.LEAF_INDEX, null);
351
352 ByteBuffer b = midLeafBlock.getBufferWithoutHeader();
353 int numDataBlocks = b.getInt();
354 int keyRelOffset = b.getInt(Bytes.SIZEOF_INT * (midKeyEntry + 1));
355 int keyLen = b.getInt(Bytes.SIZEOF_INT * (midKeyEntry + 2)) -
356 keyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;
357 int keyOffset = Bytes.SIZEOF_INT * (numDataBlocks + 2) + keyRelOffset
358 + SECONDARY_INDEX_ENTRY_OVERHEAD;
359 targetMidKey = ByteBufferUtils.toBytes(b, keyOffset, keyLen);
360 } else {
361 // The middle of the root-level index.
362 targetMidKey = blockKeys[rootCount / 2];
363 }
364
365 this.midKey.set(targetMidKey);
366 return targetMidKey;
367 }
368
369 /**
370 * @param i from 0 to {@link #getRootBlockCount() - 1}
371 */
372 public byte[] getRootBlockKey(int i) {
373 return blockKeys[i];
374 }
375
376 /**
377 * @param i from 0 to {@link #getRootBlockCount() - 1}
378 */
379 public long getRootBlockOffset(int i) {
380 return blockOffsets[i];
381 }
382
383 /**
384 * @param i zero-based index of a root-level block
385 * @return the on-disk size of the root-level block for version 2, or the
386 * uncompressed size for version 1
387 */
388 public int getRootBlockDataSize(int i) {
389 return blockDataSizes[i];
390 }
391
392 /**
393 * @return the number of root-level blocks in this block index
394 */
395 public int getRootBlockCount() {
396 return rootCount;
397 }
398
399 /**
400 * Finds the root-level index block containing the given key.
401 *
402 * @param key
403 * Key to find
404 * @return Offset of block containing <code>key</code> (between 0 and the
405 * number of blocks - 1) or -1 if this file does not contain the
406 * request.
407 */
408 public int rootBlockContainingKey(final byte[] key, int offset, int length) {
409 int pos = Bytes.binarySearch(blockKeys, key, offset, length, comparator);
410 // pos is between -(blockKeys.length + 1) to blockKeys.length - 1, see
411 // binarySearch's javadoc.
412
413 if (pos >= 0) {
414 // This means this is an exact match with an element of blockKeys.
415 assert pos < blockKeys.length;
416 return pos;
417 }
418
419 // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
420 // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
421 // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
422 // key < blockKeys[0], meaning the file does not contain the given key.
423
424 int i = -pos - 1;
425 assert 0 <= i && i <= blockKeys.length;
426 return i - 1;
427 }
428
429 /**
430 * Finds the root-level index block containing the given key.
431 *
432 * @param key
433 * Key to find
434 */
435 public int rootBlockContainingKey(final Cell key) {
436 int pos = Bytes.binarySearch(blockKeys, key, comparator);
437 // pos is between -(blockKeys.length + 1) to blockKeys.length - 1, see
438 // binarySearch's javadoc.
439
440 if (pos >= 0) {
441 // This means this is an exact match with an element of blockKeys.
442 assert pos < blockKeys.length;
443 return pos;
444 }
445
446 // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
447 // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
448 // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
449 // key < blockKeys[0], meaning the file does not contain the given key.
450
451 int i = -pos - 1;
452 assert 0 <= i && i <= blockKeys.length;
453 return i - 1;
454 }
455
456 /**
457 * Adds a new entry in the root block index. Only used when reading.
458 *
459 * @param key Last key in the block
460 * @param offset file offset where the block is stored
461 * @param dataSize the uncompressed data size
462 */
463 private void add(final byte[] key, final long offset, final int dataSize) {
464 blockOffsets[rootCount] = offset;
465 blockKeys[rootCount] = key;
466 blockDataSizes[rootCount] = dataSize;
467 rootCount++;
468 }
469
470 /**
471 * The indexed key at the ith position in the nonRootIndex. The position starts at 0.
472 * @param nonRootIndex
473 * @param i the ith position
474 * @return The indexed key at the ith position in the nonRootIndex.
475 */
476 private byte[] getNonRootIndexedKey(ByteBuffer nonRootIndex, int i) {
477 int numEntries = nonRootIndex.getInt(0);
478 if (i < 0 || i >= numEntries) {
479 return null;
480 }
481
482 // Entries start after the number of entries and the secondary index.
483 // The secondary index takes numEntries + 1 ints.
484 int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
485 // Targetkey's offset relative to the end of secondary index
486 int targetKeyRelOffset = nonRootIndex.getInt(
487 Bytes.SIZEOF_INT * (i + 1));
488
489 // The offset of the target key in the blockIndex buffer
490 int targetKeyOffset = entriesOffset // Skip secondary index
491 + targetKeyRelOffset // Skip all entries until mid
492 + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size
493
494 // We subtract the two consecutive secondary index elements, which
495 // gives us the size of the whole (offset, onDiskSize, key) tuple. We
496 // then need to subtract the overhead of offset and onDiskSize.
497 int targetKeyLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 2)) -
498 targetKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;
499
500 return ByteBufferUtils.toBytes(nonRootIndex, targetKeyOffset, targetKeyLength);
501 }
502
503 /**
504 * Performs a binary search over a non-root level index block. Utilizes the
505 * secondary index, which records the offsets of (offset, onDiskSize,
506 * firstKey) tuples of all entries.
507 *
508 * @param key
509 * the key we are searching for offsets to individual entries in
510 * the blockIndex buffer
511 * @param nonRootIndex
512 * the non-root index block buffer, starting with the secondary
513 * index. The position is ignored.
514 * @return the index i in [0, numEntries - 1] such that keys[i] <= key <
515 * keys[i + 1], if keys is the array of all keys being searched, or
516 * -1 otherwise
517 * @throws IOException
518 */
519 static int binarySearchNonRootIndex(Cell key, ByteBuffer nonRootIndex,
520 KVComparator comparator) {
521
522 int numEntries = nonRootIndex.getInt(0);
523 int low = 0;
524 int high = numEntries - 1;
525 int mid = 0;
526
527 // Entries start after the number of entries and the secondary index.
528 // The secondary index takes numEntries + 1 ints.
529 int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
530
531 // If we imagine that keys[-1] = -Infinity and
532 // keys[numEntries] = Infinity, then we are maintaining an invariant that
533 // keys[low - 1] < key < keys[high + 1] while narrowing down the range.
534 KeyValue.KeyOnlyKeyValue nonRootIndexKV = new KeyValue.KeyOnlyKeyValue();
535 while (low <= high) {
536 mid = low + ((high - low) >> 1);
537
538 // Midkey's offset relative to the end of secondary index
539 int midKeyRelOffset = nonRootIndex.getInt(
540 Bytes.SIZEOF_INT * (mid + 1));
541
542 // The offset of the middle key in the blockIndex buffer
543 int midKeyOffset = entriesOffset // Skip secondary index
544 + midKeyRelOffset // Skip all entries until mid
545 + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size
546
547 // We subtract the two consecutive secondary index elements, which
548 // gives us the size of the whole (offset, onDiskSize, key) tuple. We
549 // then need to subtract the overhead of offset and onDiskSize.
550 int midLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (mid + 2)) -
551 midKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;
552
553 // we have to compare in this order, because the comparator order
554 // has special logic when the 'left side' is a special key.
555 // TODO make KeyOnlyKeyValue to be Buffer backed and avoid array() call. This has to be
556 // done after HBASE-12224 & HBASE-12282
557 nonRootIndexKV.setKey(nonRootIndex.array(),
558 nonRootIndex.arrayOffset() + midKeyOffset, midLength);
559 int cmp = comparator.compareOnlyKeyPortion(key, nonRootIndexKV);
560
561 // key lives above the midpoint
562 if (cmp > 0)
563 low = mid + 1; // Maintain the invariant that keys[low - 1] < key
564 // key lives below the midpoint
565 else if (cmp < 0)
566 high = mid - 1; // Maintain the invariant that key < keys[high + 1]
567 else
568 return mid; // exact match
569 }
570
571 // As per our invariant, keys[low - 1] < key < keys[high + 1], meaning
572 // that low - 1 < high + 1 and (low - high) <= 1. As per the loop break
573 // condition, low >= high + 1. Therefore, low = high + 1.
574
575 if (low != high + 1) {
576 throw new IllegalStateException("Binary search broken: low=" + low
577 + " " + "instead of " + (high + 1));
578 }
579
580 // OK, our invariant says that keys[low - 1] < key < keys[low]. We need to
581 // return i such that keys[i] <= key < keys[i + 1]. Therefore i = low - 1.
582 int i = low - 1;
583
584 // Some extra validation on the result.
585 if (i < -1 || i >= numEntries) {
586 throw new IllegalStateException("Binary search broken: result is " +
587 i + " but expected to be between -1 and (numEntries - 1) = " +
588 (numEntries - 1));
589 }
590
591 return i;
592 }
593
594 /**
595 * Search for one key using the secondary index in a non-root block. In case
596 * of success, positions the provided buffer at the entry of interest, where
597 * the file offset and the on-disk-size can be read.
598 *
599 * @param nonRootBlock
600 * a non-root block without header. Initial position does not
601 * matter.
602 * @param key
603 * the byte array containing the key
604 * @return the index position where the given key was found, otherwise
605 * return -1 in the case the given key is before the first key.
606 *
607 */
608 static int locateNonRootIndexEntry(ByteBuffer nonRootBlock, Cell key,
609 KVComparator comparator) {
610 int entryIndex = binarySearchNonRootIndex(key, nonRootBlock, comparator);
611
612 if (entryIndex != -1) {
613 int numEntries = nonRootBlock.getInt(0);
614
615 // The end of secondary index and the beginning of entries themselves.
616 int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
617
618 // The offset of the entry we are interested in relative to the end of
619 // the secondary index.
620 int entryRelOffset = nonRootBlock.getInt(Bytes.SIZEOF_INT * (1 + entryIndex));
621
622 nonRootBlock.position(entriesOffset + entryRelOffset);
623 }
624
625 return entryIndex;
626 }
627
628 /**
629 * Read in the root-level index from the given input stream. Must match
630 * what was written into the root level by
631 * {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the
632 * offset that function returned.
633 *
634 * @param in the buffered input stream or wrapped byte input stream
635 * @param numEntries the number of root-level index entries
636 * @throws IOException
637 */
638 public void readRootIndex(DataInput in, final int numEntries)
639 throws IOException {
640 blockOffsets = new long[numEntries];
641 blockKeys = new byte[numEntries][];
642 blockDataSizes = new int[numEntries];
643
644 // If index size is zero, no index was written.
645 if (numEntries > 0) {
646 for (int i = 0; i < numEntries; ++i) {
647 long offset = in.readLong();
648 int dataSize = in.readInt();
649 byte[] key = Bytes.readByteArray(in);
650 add(key, offset, dataSize);
651 }
652 }
653 }
654
655 /**
656 * Read in the root-level index from the given input stream. Must match
657 * what was written into the root level by
658 * {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the
659 * offset that function returned.
660 *
661 * @param blk the HFile block
662 * @param numEntries the number of root-level index entries
663 * @return the buffered input stream or wrapped byte input stream
664 * @throws IOException
665 */
666 public DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException {
667 DataInputStream in = blk.getByteStream();
668 readRootIndex(in, numEntries);
669 return in;
670 }
671
672 /**
673 * Read the root-level metadata of a multi-level block index. Based on
674 * {@link #readRootIndex(DataInput, int)}, but also reads metadata
675 * necessary to compute the mid-key in a multi-level index.
676 *
677 * @param blk the HFile block
678 * @param numEntries the number of root-level index entries
679 * @throws IOException
680 */
681 public void readMultiLevelIndexRoot(HFileBlock blk,
682 final int numEntries) throws IOException {
683 DataInputStream in = readRootIndex(blk, numEntries);
684 // after reading the root index the checksum bytes have to
685 // be subtracted to know if the mid key exists.
686 int checkSumBytes = blk.totalChecksumBytes();
687 if ((in.available() - checkSumBytes) < MID_KEY_METADATA_SIZE) {
688 // No mid-key metadata available.
689 return;
690 }
691 midLeafBlockOffset = in.readLong();
692 midLeafBlockOnDiskSize = in.readInt();
693 midKeyEntry = in.readInt();
694 }
695
696 @Override
697 public String toString() {
698 StringBuilder sb = new StringBuilder();
699 sb.append("size=" + rootCount).append("\n");
700 for (int i = 0; i < rootCount; i++) {
701 sb.append("key=").append(KeyValue.keyToString(blockKeys[i]))
702 .append("\n offset=").append(blockOffsets[i])
703 .append(", dataSize=" + blockDataSizes[i]).append("\n");
704 }
705 return sb.toString();
706 }
707
708 @Override
709 public long heapSize() {
710 long heapSize = ClassSize.align(6 * ClassSize.REFERENCE +
711 2 * Bytes.SIZEOF_INT + ClassSize.OBJECT);
712
713 // Mid-key metadata.
714 heapSize += MID_KEY_METADATA_SIZE;
715
716 // Calculating the size of blockKeys
717 if (blockKeys != null) {
718 // Adding array + references overhead
719 heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length
720 * ClassSize.REFERENCE);
721
722 // Adding bytes
723 for (byte[] key : blockKeys) {
724 heapSize += ClassSize.align(ClassSize.ARRAY + key.length);
725 }
726 }
727
728 if (blockOffsets != null) {
729 heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length
730 * Bytes.SIZEOF_LONG);
731 }
732
733 if (blockDataSizes != null) {
734 heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length
735 * Bytes.SIZEOF_INT);
736 }
737
738 return ClassSize.align(heapSize);
739 }
740
741 }
742
743 /**
744 * Writes the block index into the output stream. Generate the tree from
745 * bottom up. The leaf level is written to disk as a sequence of inline
746 * blocks, if it is larger than a certain number of bytes. If the leaf level
747 * is not large enough, we write all entries to the root level instead.
748 *
749 * After all leaf blocks have been written, we end up with an index
750 * referencing the resulting leaf index blocks. If that index is larger than
751 * the allowed root index size, the writer will break it up into
 * reasonable-size intermediate-level index block chunks, write those chunks
753 * out, and create another index referencing those chunks. This will be
754 * repeated until the remaining index is small enough to become the root
755 * index. However, in most practical cases we will only have leaf-level
756 * blocks and the root index, or just the root index.
757 */
758 public static class BlockIndexWriter implements InlineBlockWriter {
/**
 * While the index is being written, this represents the current block
 * index referencing all leaf blocks, with one exception. If the file is
 * being closed and there are not enough blocks to complete even a single
 * leaf block, no leaf blocks get written and this contains the entire
 * block index. After all levels of the index were written by
 * {@link #writeIndexBlocks(FSDataOutputStream)}, this contains the final
 * root-level index.
 */
private BlockIndexChunk rootChunk = new BlockIndexChunk();

/**
 * Current leaf-level chunk. New entries referencing data blocks get added
 * to this chunk until it grows large enough to be written to disk.
 */
private BlockIndexChunk curInlineChunk = new BlockIndexChunk();

/**
 * The number of block index levels. This is one if there is only root
 * level (even empty), two if there is a leaf level and root level, and is
 * higher if there are intermediate levels. This is only final after
 * {@link #writeIndexBlocks(FSDataOutputStream)} has been called. The
 * initial value accounts for the root level, and will be increased to two
 * as soon as we find out there is a leaf-level in
 * {@link #blockWritten(long, int, int)}.
 */
private int numLevels = 1;

/** The block writer to use to write index blocks; null for the single-level writer. */
private HFileBlock.Writer blockWriter;
// NOTE(review): firstKey is not assigned anywhere in the visible part of
// this class; presumably it tracks the first key of the current inline
// chunk -- confirm against the rest of the file.
private byte[] firstKey = null;

/**
 * The total number of leaf-level entries, i.e. entries referenced by
 * leaf-level blocks. For the data block index this is equal to the number
 * of data blocks.
 */
private long totalNumEntries;

/** Total compressed size of all index blocks. */
private long totalBlockOnDiskSize;

/** Total uncompressed size of all index blocks. */
private long totalBlockUncompressedSize;

/** The maximum size guideline of all multi-level index blocks. */
private int maxChunkSize;

/**
 * The minimum number of entries an index block must hold before the index
 * is split further, even when it exceeds {@link #maxChunkSize}.
 */
private int minIndexNumEntries;

/** Whether we require this block index to always be single-level. */
private boolean singleLevelOnly;

/** CacheConfig, or null if cache-on-write is disabled */
private CacheConfig cacheConf;

/** Name to use for computing cache keys */
private String nameForCaching;
817
/**
 * Creates a single-level block index writer: no block writer, no cache
 * configuration, and the single-level-only flag set.
 */
public BlockIndexWriter() {
  this(null, null, null);
  this.singleLevelOnly = true;
}
823
824 /**
825 * Creates a multi-level block index writer.
826 *
827 * @param blockWriter the block writer to use to write index blocks
828 * @param cacheConf used to determine when and how a block should be cached-on-write.
829 */
830 public BlockIndexWriter(HFileBlock.Writer blockWriter,
831 CacheConfig cacheConf, String nameForCaching) {
832 if ((cacheConf == null) != (nameForCaching == null)) {
833 throw new IllegalArgumentException("Block cache and file name for " +
834 "caching must be both specified or both null");
835 }
836
837 this.blockWriter = blockWriter;
838 this.cacheConf = cacheConf;
839 this.nameForCaching = nameForCaching;
840 this.maxChunkSize = HFileBlockIndex.DEFAULT_MAX_CHUNK_SIZE;
841 this.minIndexNumEntries = HFileBlockIndex.DEFAULT_MIN_INDEX_NUM_ENTRIES;
842 }
843
844 public void setMaxChunkSize(int maxChunkSize) {
845 if (maxChunkSize <= 0) {
846 throw new IllegalArgumentException("Invalid maximum index block size");
847 }
848 this.maxChunkSize = maxChunkSize;
849 }
850
851 public void setMinIndexNumEntries(int minIndexNumEntries) {
852 if (minIndexNumEntries <= 1) {
853 throw new IllegalArgumentException("Invalid maximum index level, should be >= 2");
854 }
855 this.minIndexNumEntries = minIndexNumEntries;
856 }
857
858 /**
859 * Writes the root level and intermediate levels of the block index into
860 * the output stream, generating the tree from bottom up. Assumes that the
861 * leaf level has been inline-written to the disk if there is enough data
862 * for more than one leaf block. We iterate by breaking the current level
863 * of the block index, starting with the index of all leaf-level blocks,
864 * into chunks small enough to be written to disk, and generate its parent
865 * level, until we end up with a level small enough to become the root
866 * level.
867 *
868 * If the leaf level is not large enough, there is no inline block index
869 * anymore, so we only write that level of block index to disk as the root
870 * level.
871 *
872 * @param out FSDataOutputStream
873 * @return position at which we entered the root-level index.
874 * @throws IOException
875 */
876 public long writeIndexBlocks(FSDataOutputStream out) throws IOException {
877 if (curInlineChunk != null && curInlineChunk.getNumEntries() != 0) {
878 throw new IOException("Trying to write a multi-level block index, " +
879 "but are " + curInlineChunk.getNumEntries() + " entries in the " +
880 "last inline chunk.");
881 }
882
883 // We need to get mid-key metadata before we create intermediate
884 // indexes and overwrite the root chunk.
885 byte[] midKeyMetadata = numLevels > 1 ? rootChunk.getMidKeyMetadata()
886 : null;
887
888 if (curInlineChunk != null) {
889 while (rootChunk.getRootSize() > maxChunkSize
890 // HBASE-16288: if firstKey is larger than maxChunkSize we will loop indefinitely
891 && rootChunk.getNumEntries() > minIndexNumEntries
892 // Sanity check. We will not hit this (minIndexNumEntries ^ 16) blocks can be addressed
893 && numLevels < 16) {
894 rootChunk = writeIntermediateLevel(out, rootChunk);
895 numLevels += 1;
896 }
897 }
898
899 // write the root level
900 long rootLevelIndexPos = out.getPos();
901
902 {
903 DataOutput blockStream =
904 blockWriter.startWriting(BlockType.ROOT_INDEX);
905 rootChunk.writeRoot(blockStream);
906 if (midKeyMetadata != null)
907 blockStream.write(midKeyMetadata);
908 blockWriter.writeHeaderAndData(out);
909 if (cacheConf != null) {
910 HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf);
911 cacheConf.getBlockCache().cacheBlock(new BlockCacheKey(nameForCaching,
912 rootLevelIndexPos, true, blockForCaching.getBlockType()), blockForCaching);
913 }
914 }
915
916 // Add root index block size
917 totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
918 totalBlockUncompressedSize +=
919 blockWriter.getUncompressedSizeWithoutHeader();
920
921 if (LOG.isTraceEnabled()) {
922 LOG.trace("Wrote a " + numLevels + "-level index with root level at pos "
923 + rootLevelIndexPos + ", " + rootChunk.getNumEntries()
924 + " root-level entries, " + totalNumEntries + " total entries, "
925 + StringUtils.humanReadableInt(this.totalBlockOnDiskSize) +
926 " on-disk size, "
927 + StringUtils.humanReadableInt(totalBlockUncompressedSize) +
928 " total uncompressed size.");
929 }
930 return rootLevelIndexPos;
931 }
932
933 /**
934 * Writes the block index data as a single level only. Does not do any
935 * block framing.
936 *
937 * @param out the buffered output stream to write the index to. Typically a
938 * stream writing into an {@link HFile} block.
939 * @param description a short description of the index being written. Used
940 * in a log message.
941 * @throws IOException
942 */
943 public void writeSingleLevelIndex(DataOutput out, String description)
944 throws IOException {
945 expectNumLevels(1);
946
947 if (!singleLevelOnly)
948 throw new IOException("Single-level mode is turned off");
949
950 if (rootChunk.getNumEntries() > 0)
951 throw new IOException("Root-level entries already added in " +
952 "single-level mode");
953
954 rootChunk = curInlineChunk;
955 curInlineChunk = new BlockIndexChunk();
956
957 if (LOG.isTraceEnabled()) {
958 LOG.trace("Wrote a single-level " + description + " index with "
959 + rootChunk.getNumEntries() + " entries, " + rootChunk.getRootSize()
960 + " bytes");
961 }
962 rootChunk.writeRoot(out);
963 }
964
965 /**
966 * Split the current level of the block index into intermediate index
967 * blocks of permitted size and write those blocks to disk. Return the next
968 * level of the block index referencing those intermediate-level blocks.
969 *
970 * @param out
971 * @param currentLevel the current level of the block index, such as the a
972 * chunk referencing all leaf-level index blocks
973 * @return the parent level block index, which becomes the root index after
974 * a few (usually zero) iterations
975 * @throws IOException
976 */
977 private BlockIndexChunk writeIntermediateLevel(FSDataOutputStream out,
978 BlockIndexChunk currentLevel) throws IOException {
979 // Entries referencing intermediate-level blocks we are about to create.
980 BlockIndexChunk parent = new BlockIndexChunk();
981
982 // The current intermediate-level block index chunk.
983 BlockIndexChunk curChunk = new BlockIndexChunk();
984
985 for (int i = 0; i < currentLevel.getNumEntries(); ++i) {
986 curChunk.add(currentLevel.getBlockKey(i),
987 currentLevel.getBlockOffset(i), currentLevel.getOnDiskDataSize(i));
988
989 // HBASE-16288: We have to have at least minIndexNumEntries(16) items in the index so that
990 // we won't end up with too-many levels for a index with very large rowKeys. Also, if the
991 // first key is larger than maxChunkSize this will cause infinite recursion.
992 if (i >= minIndexNumEntries && curChunk.getRootSize() >= maxChunkSize) {
993 writeIntermediateBlock(out, parent, curChunk);
994 }
995 }
996
997 if (curChunk.getNumEntries() > 0) {
998 writeIntermediateBlock(out, parent, curChunk);
999 }
1000
1001 return parent;
1002 }
1003
1004 private void writeIntermediateBlock(FSDataOutputStream out,
1005 BlockIndexChunk parent, BlockIndexChunk curChunk) throws IOException {
1006 long beginOffset = out.getPos();
1007 DataOutputStream dos = blockWriter.startWriting(
1008 BlockType.INTERMEDIATE_INDEX);
1009 curChunk.writeNonRoot(dos);
1010 byte[] curFirstKey = curChunk.getBlockKey(0);
1011 blockWriter.writeHeaderAndData(out);
1012
1013 if (getCacheOnWrite()) {
1014 HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf);
1015 cacheConf.getBlockCache().cacheBlock(new BlockCacheKey(nameForCaching,
1016 beginOffset, true, blockForCaching.getBlockType()), blockForCaching);
1017 }
1018
1019 // Add intermediate index block size
1020 totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
1021 totalBlockUncompressedSize +=
1022 blockWriter.getUncompressedSizeWithoutHeader();
1023
1024 // OFFSET is the beginning offset the chunk of block index entries.
1025 // SIZE is the total byte size of the chunk of block index entries
1026 // + the secondary index size
1027 // FIRST_KEY is the first key in the chunk of block index
1028 // entries.
1029 parent.add(curFirstKey, beginOffset,
1030 blockWriter.getOnDiskSizeWithHeader());
1031
1032 // clear current block index chunk
1033 curChunk.clear();
1034 curFirstKey = null;
1035 }
1036
1037 /**
1038 * @return how many block index entries there are in the root level
1039 */
1040 public final int getNumRootEntries() {
1041 return rootChunk.getNumEntries();
1042 }
1043
1044 /**
1045 * @return the number of levels in this block index.
1046 */
1047 public int getNumLevels() {
1048 return numLevels;
1049 }
1050
1051 private void expectNumLevels(int expectedNumLevels) {
1052 if (numLevels != expectedNumLevels) {
1053 throw new IllegalStateException("Number of block index levels is "
1054 + numLevels + "but is expected to be " + expectedNumLevels);
1055 }
1056 }
1057
1058 /**
1059 * Whether there is an inline block ready to be written. In general, we
1060 * write an leaf-level index block as an inline block as soon as its size
1061 * as serialized in the non-root format reaches a certain threshold.
1062 */
1063 @Override
1064 public boolean shouldWriteBlock(boolean closing) {
1065 if (singleLevelOnly) {
1066 throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
1067 }
1068
1069 if (curInlineChunk == null) {
1070 throw new IllegalStateException("curInlineChunk is null; has shouldWriteBlock been " +
1071 "called with closing=true and then called again?");
1072 }
1073
1074 if (curInlineChunk.getNumEntries() == 0) {
1075 return false;
1076 }
1077
1078 // We do have some entries in the current inline chunk.
1079 if (closing) {
1080 if (rootChunk.getNumEntries() == 0) {
1081 // We did not add any leaf-level blocks yet. Instead of creating a
1082 // leaf level with one block, move these entries to the root level.
1083
1084 expectNumLevels(1);
1085 rootChunk = curInlineChunk;
1086 curInlineChunk = null; // Disallow adding any more index entries.
1087 return false;
1088 }
1089
1090 return true;
1091 } else {
1092 return curInlineChunk.getNonRootSize() >= maxChunkSize;
1093 }
1094 }
1095
1096 /**
1097 * Write out the current inline index block. Inline blocks are non-root
1098 * blocks, so the non-root index format is used.
1099 *
1100 * @param out
1101 */
1102 @Override
1103 public void writeInlineBlock(DataOutput out) throws IOException {
1104 if (singleLevelOnly)
1105 throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
1106
1107 // Write the inline block index to the output stream in the non-root
1108 // index block format.
1109 curInlineChunk.writeNonRoot(out);
1110
1111 // Save the first key of the inline block so that we can add it to the
1112 // parent-level index.
1113 firstKey = curInlineChunk.getBlockKey(0);
1114
1115 // Start a new inline index block
1116 curInlineChunk.clear();
1117 }
1118
1119 /**
1120 * Called after an inline block has been written so that we can add an
1121 * entry referring to that block to the parent-level index.
1122 */
1123 @Override
1124 public void blockWritten(long offset, int onDiskSize, int uncompressedSize) {
1125 // Add leaf index block size
1126 totalBlockOnDiskSize += onDiskSize;
1127 totalBlockUncompressedSize += uncompressedSize;
1128
1129 if (singleLevelOnly)
1130 throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
1131
1132 if (firstKey == null) {
1133 throw new IllegalStateException("Trying to add second-level index " +
1134 "entry with offset=" + offset + " and onDiskSize=" + onDiskSize +
1135 "but the first key was not set in writeInlineBlock");
1136 }
1137
1138 if (rootChunk.getNumEntries() == 0) {
1139 // We are writing the first leaf block, so increase index level.
1140 expectNumLevels(1);
1141 numLevels = 2;
1142 }
1143
1144 // Add another entry to the second-level index. Include the number of
1145 // entries in all previous leaf-level chunks for mid-key calculation.
1146 rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries);
1147 firstKey = null;
1148 }
1149
/**
 * @return the block type used for the inline blocks of this writer, always
 *         {@link BlockType#LEAF_INDEX}
 */
@Override
public BlockType getInlineBlockType() {
  return BlockType.LEAF_INDEX;
}
1154
1155 /**
1156 * Add one index entry to the current leaf-level block. When the leaf-level
1157 * block gets large enough, it will be flushed to disk as an inline block.
1158 *
1159 * @param firstKey the first key of the data block
1160 * @param blockOffset the offset of the data block
1161 * @param blockDataSize the on-disk size of the data block ({@link HFile}
1162 * format version 2), or the uncompressed size of the data block (
1163 * {@link HFile} format version 1).
1164 */
1165 public void addEntry(byte[] firstKey, long blockOffset, int blockDataSize) {
1166 curInlineChunk.add(firstKey, blockOffset, blockDataSize);
1167 ++totalNumEntries;
1168 }
1169
1170 /**
1171 * @throws IOException if we happened to write a multi-level index.
1172 */
1173 public void ensureSingleLevel() throws IOException {
1174 if (numLevels > 1) {
1175 throw new IOException ("Wrote a " + numLevels + "-level index with " +
1176 rootChunk.getNumEntries() + " root-level entries, but " +
1177 "this is expected to be a single-level block index.");
1178 }
1179 }
1180
1181 /**
1182 * @return true if we are using cache-on-write. This is configured by the
1183 * caller of the constructor by either passing a valid block cache
1184 * or null.
1185 */
1186 @Override
1187 public boolean getCacheOnWrite() {
1188 return cacheConf != null && cacheConf.shouldCacheIndexesOnWrite();
1189 }
1190
1191 /**
1192 * The total uncompressed size of the root index block, intermediate-level
1193 * index blocks, and leaf-level index blocks.
1194 *
1195 * @return the total uncompressed size of all index blocks
1196 */
1197 public long getTotalUncompressedSize() {
1198 return totalBlockUncompressedSize;
1199 }
1200
1201 }
1202
1203 /**
1204 * A single chunk of the block index in the process of writing. The data in
1205 * this chunk can become a leaf-level, intermediate-level, or root index
1206 * block.
1207 */
1208 static class BlockIndexChunk {
1209
1210 /** First keys of the key range corresponding to each index entry. */
1211 private final List<byte[]> blockKeys = new ArrayList<byte[]>();
1212
1213 /** Block offset in backing stream. */
1214 private final List<Long> blockOffsets = new ArrayList<Long>();
1215
1216 /** On-disk data sizes of lower-level data or index blocks. */
1217 private final List<Integer> onDiskDataSizes = new ArrayList<Integer>();
1218
1219 /**
1220 * The cumulative number of sub-entries, i.e. entries on deeper-level block
1221 * index entries. numSubEntriesAt[i] is the number of sub-entries in the
1222 * blocks corresponding to this chunk's entries #0 through #i inclusively.
1223 */
1224 private final List<Long> numSubEntriesAt = new ArrayList<Long>();
1225
1226 /**
1227 * The offset of the next entry to be added, relative to the end of the
1228 * "secondary index" in the "non-root" format representation of this index
1229 * chunk. This is the next value to be added to the secondary index.
1230 */
1231 private int curTotalNonRootEntrySize = 0;
1232
1233 /**
1234 * The accumulated size of this chunk if stored in the root index format.
1235 */
1236 private int curTotalRootSize = 0;
1237
1238 /**
1239 * The "secondary index" used for binary search over variable-length
1240 * records in a "non-root" format block. These offsets are relative to the
1241 * end of this secondary index.
1242 */
1243 private final List<Integer> secondaryIndexOffsetMarks =
1244 new ArrayList<Integer>();
1245
1246 /**
1247 * Adds a new entry to this block index chunk.
1248 *
1249 * @param firstKey the first key in the block pointed to by this entry
1250 * @param blockOffset the offset of the next-level block pointed to by this
1251 * entry
1252 * @param onDiskDataSize the on-disk data of the block pointed to by this
1253 * entry, including header size
1254 * @param curTotalNumSubEntries if this chunk is the root index chunk under
1255 * construction, this specifies the current total number of
1256 * sub-entries in all leaf-level chunks, including the one
1257 * corresponding to the second-level entry being added.
1258 */
1259 void add(byte[] firstKey, long blockOffset, int onDiskDataSize,
1260 long curTotalNumSubEntries) {
1261 // Record the offset for the secondary index
1262 secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize);
1263 curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD
1264 + firstKey.length;
1265
1266 curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT
1267 + WritableUtils.getVIntSize(firstKey.length) + firstKey.length;
1268
1269 blockKeys.add(firstKey);
1270 blockOffsets.add(blockOffset);
1271 onDiskDataSizes.add(onDiskDataSize);
1272
1273 if (curTotalNumSubEntries != -1) {
1274 numSubEntriesAt.add(curTotalNumSubEntries);
1275
1276 // Make sure the parallel arrays are in sync.
1277 if (numSubEntriesAt.size() != blockKeys.size()) {
1278 throw new IllegalStateException("Only have key/value count " +
1279 "stats for " + numSubEntriesAt.size() + " block index " +
1280 "entries out of " + blockKeys.size());
1281 }
1282 }
1283 }
1284
1285 /**
1286 * The same as {@link #add(byte[], long, int, long)} but does not take the
1287 * key/value into account. Used for single-level indexes.
1288 *
1289 * @see {@link #add(byte[], long, int, long)}
1290 */
1291 public void add(byte[] firstKey, long blockOffset, int onDiskDataSize) {
1292 add(firstKey, blockOffset, onDiskDataSize, -1);
1293 }
1294
1295 public void clear() {
1296 blockKeys.clear();
1297 blockOffsets.clear();
1298 onDiskDataSizes.clear();
1299 secondaryIndexOffsetMarks.clear();
1300 numSubEntriesAt.clear();
1301 curTotalNonRootEntrySize = 0;
1302 curTotalRootSize = 0;
1303 }
1304
1305 /**
1306 * Finds the entry corresponding to the deeper-level index block containing
1307 * the given deeper-level entry (a "sub-entry"), assuming a global 0-based
1308 * ordering of sub-entries.
1309 *
1310 * <p>
1311 * <i> Implementation note. </i> We are looking for i such that
1312 * numSubEntriesAt[i - 1] <= k < numSubEntriesAt[i], because a deeper-level
1313 * block #i (0-based) contains sub-entries # numSubEntriesAt[i - 1]'th
1314 * through numSubEntriesAt[i] - 1, assuming a global 0-based ordering of
1315 * sub-entries. i is by definition the insertion point of k in
1316 * numSubEntriesAt.
1317 *
1318 * @param k sub-entry index, from 0 to the total number sub-entries - 1
1319 * @return the 0-based index of the entry corresponding to the given
1320 * sub-entry
1321 */
1322 public int getEntryBySubEntry(long k) {
1323 // We define mid-key as the key corresponding to k'th sub-entry
1324 // (0-based).
1325
1326 int i = Collections.binarySearch(numSubEntriesAt, k);
1327
1328 // Exact match: cumulativeWeight[i] = k. This means chunks #0 through
1329 // #i contain exactly k sub-entries, and the sub-entry #k (0-based)
1330 // is in the (i + 1)'th chunk.
1331 if (i >= 0)
1332 return i + 1;
1333
1334 // Inexact match. Return the insertion point.
1335 return -i - 1;
1336 }
1337
1338 /**
1339 * Used when writing the root block index of a multi-level block index.
1340 * Serializes additional information allowing to efficiently identify the
1341 * mid-key.
1342 *
1343 * @return a few serialized fields for finding the mid-key
1344 * @throws IOException if could not create metadata for computing mid-key
1345 */
1346 public byte[] getMidKeyMetadata() throws IOException {
1347 ByteArrayOutputStream baos = new ByteArrayOutputStream(
1348 MID_KEY_METADATA_SIZE);
1349 DataOutputStream baosDos = new DataOutputStream(baos);
1350 long totalNumSubEntries = numSubEntriesAt.get(blockKeys.size() - 1);
1351 if (totalNumSubEntries == 0) {
1352 throw new IOException("No leaf-level entries, mid-key unavailable");
1353 }
1354 long midKeySubEntry = (totalNumSubEntries - 1) / 2;
1355 int midKeyEntry = getEntryBySubEntry(midKeySubEntry);
1356
1357 baosDos.writeLong(blockOffsets.get(midKeyEntry));
1358 baosDos.writeInt(onDiskDataSizes.get(midKeyEntry));
1359
1360 long numSubEntriesBefore = midKeyEntry > 0
1361 ? numSubEntriesAt.get(midKeyEntry - 1) : 0;
1362 long subEntryWithinEntry = midKeySubEntry - numSubEntriesBefore;
1363 if (subEntryWithinEntry < 0 || subEntryWithinEntry > Integer.MAX_VALUE)
1364 {
1365 throw new IOException("Could not identify mid-key index within the "
1366 + "leaf-level block containing mid-key: out of range ("
1367 + subEntryWithinEntry + ", numSubEntriesBefore="
1368 + numSubEntriesBefore + ", midKeySubEntry=" + midKeySubEntry
1369 + ")");
1370 }
1371
1372 baosDos.writeInt((int) subEntryWithinEntry);
1373
1374 if (baosDos.size() != MID_KEY_METADATA_SIZE) {
1375 throw new IOException("Could not write mid-key metadata: size=" +
1376 baosDos.size() + ", correct size: " + MID_KEY_METADATA_SIZE);
1377 }
1378
1379 // Close just to be good citizens, although this has no effect.
1380 baos.close();
1381
1382 return baos.toByteArray();
1383 }
1384
1385 /**
1386 * Writes the block index chunk in the non-root index block format. This
1387 * format contains the number of entries, an index of integer offsets
1388 * for quick binary search on variable-length records, and tuples of
1389 * block offset, on-disk block size, and the first key for each entry.
1390 *
1391 * @param out
1392 * @throws IOException
1393 */
1394 void writeNonRoot(DataOutput out) throws IOException {
1395 // The number of entries in the block.
1396 out.writeInt(blockKeys.size());
1397
1398 if (secondaryIndexOffsetMarks.size() != blockKeys.size()) {
1399 throw new IOException("Corrupted block index chunk writer: " +
1400 blockKeys.size() + " entries but " +
1401 secondaryIndexOffsetMarks.size() + " secondary index items");
1402 }
1403
1404 // For each entry, write a "secondary index" of relative offsets to the
1405 // entries from the end of the secondary index. This works, because at
1406 // read time we read the number of entries and know where the secondary
1407 // index ends.
1408 for (int currentSecondaryIndex : secondaryIndexOffsetMarks)
1409 out.writeInt(currentSecondaryIndex);
1410
1411 // We include one other element in the secondary index to calculate the
1412 // size of each entry more easily by subtracting secondary index elements.
1413 out.writeInt(curTotalNonRootEntrySize);
1414
1415 for (int i = 0; i < blockKeys.size(); ++i) {
1416 out.writeLong(blockOffsets.get(i));
1417 out.writeInt(onDiskDataSizes.get(i));
1418 out.write(blockKeys.get(i));
1419 }
1420 }
1421
1422 /**
1423 * @return the size of this chunk if stored in the non-root index block
1424 * format
1425 */
1426 int getNonRootSize() {
1427 return Bytes.SIZEOF_INT // Number of entries
1428 + Bytes.SIZEOF_INT * (blockKeys.size() + 1) // Secondary index
1429 + curTotalNonRootEntrySize; // All entries
1430 }
1431
1432 /**
1433 * Writes this chunk into the given output stream in the root block index
1434 * format. This format is similar to the {@link HFile} version 1 block
1435 * index format, except that we store on-disk size of the block instead of
1436 * its uncompressed size.
1437 *
1438 * @param out the data output stream to write the block index to. Typically
1439 * a stream writing into an {@link HFile} block.
1440 * @throws IOException
1441 */
1442 void writeRoot(DataOutput out) throws IOException {
1443 for (int i = 0; i < blockKeys.size(); ++i) {
1444 out.writeLong(blockOffsets.get(i));
1445 out.writeInt(onDiskDataSizes.get(i));
1446 Bytes.writeByteArray(out, blockKeys.get(i));
1447 }
1448 }
1449
1450 /**
1451 * @return the size of this chunk if stored in the root index block format
1452 */
1453 int getRootSize() {
1454 return curTotalRootSize;
1455 }
1456
1457 /**
1458 * @return the number of entries in this block index chunk
1459 */
1460 public int getNumEntries() {
1461 return blockKeys.size();
1462 }
1463
1464 public byte[] getBlockKey(int i) {
1465 return blockKeys.get(i);
1466 }
1467
1468 public long getBlockOffset(int i) {
1469 return blockOffsets.get(i);
1470 }
1471
1472 public int getOnDiskDataSize(int i) {
1473 return onDiskDataSizes.get(i);
1474 }
1475
1476 public long getCumulativeNumKV(int i) {
1477 if (i < 0)
1478 return 0;
1479 return numSubEntriesAt.get(i);
1480 }
1481
1482 }
1483
1484 public static int getMaxChunkSize(Configuration conf) {
1485 return conf.getInt(MAX_CHUNK_SIZE_KEY, DEFAULT_MAX_CHUNK_SIZE);
1486 }
1487
1488 public static int getMinIndexNumEntries(Configuration conf) {
1489 return conf.getInt(MIN_INDEX_NUM_ENTRIES_KEY, DEFAULT_MIN_INDEX_NUM_ENTRIES);
1490 }
1491 }