View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.io.hfile;
19  
20  import com.google.common.base.Preconditions;
21  import java.io.DataInputStream;
22  import java.io.DataOutput;
23  import java.io.DataOutputStream;
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.nio.ByteBuffer;
27  import java.util.concurrent.atomic.AtomicReference;
28  import java.util.concurrent.locks.Lock;
29  import java.util.concurrent.locks.ReentrantLock;
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  import org.apache.hadoop.fs.FSDataInputStream;
33  import org.apache.hadoop.fs.FSDataOutputStream;
34  import org.apache.hadoop.fs.Path;
35  import org.apache.hadoop.hbase.Cell;
36  import org.apache.hadoop.hbase.HConstants;
37  import org.apache.hadoop.hbase.classification.InterfaceAudience;
38  import org.apache.hadoop.hbase.fs.HFileSystem;
39  import org.apache.hadoop.hbase.io.ByteArrayOutputStream;
40  import org.apache.hadoop.hbase.io.ByteBufferInputStream;
41  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
42  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
43  import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
44  import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultDecodingContext;
45  import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultEncodingContext;
46  import org.apache.hadoop.hbase.io.encoding.HFileBlockEncodingContext;
47  import org.apache.hadoop.hbase.util.ByteBufferUtils;
48  import org.apache.hadoop.hbase.util.Bytes;
49  import org.apache.hadoop.hbase.util.ChecksumType;
50  import org.apache.hadoop.hbase.util.ClassSize;
51  import org.apache.hadoop.io.IOUtils;
52  
53  /**
54   * Reads {@link HFile} version 2 blocks to HFiles and via {@link Cacheable} Interface to caches.
55   * Version 2 was introduced in hbase-0.92.0. No longer has support for version 1 blocks since
56   * hbase-1.3.0.
57   *
58   * <p>Version 1 was the original file block. Version 2 was introduced when we changed the hbase file
59   * format to support multi-level block indexes and compound bloom filters (HBASE-3857).
60   *
61   * <h3>HFileBlock: Version 2</h3>
62   * In version 2, a block is structured as follows:
63   * <ul>
64   * <li><b>Header:</b> See Writer#putHeader() for where header is written; header total size is
65   * HFILEBLOCK_HEADER_SIZE
66   * <ul>
67   * <li>0. blockType: Magic record identifying the {@link BlockType} (8 bytes):
68   * e.g. <code>DATABLK*</code>
69   * <li>1. onDiskSizeWithoutHeader: Compressed -- a.k.a 'on disk' -- block size, excluding header,
70   * but including trailing checksum bytes (4 bytes)
71   * <li>2. uncompressedSizeWithoutHeader: Uncompressed block size, excluding header, and excluding
72   * checksum bytes (4 bytes)
73   * <li>3. prevBlockOffset: The offset of the previous block of the same type (8 bytes). This is
74   * used to navigate to the previous block without having to go to the block index
75   * <li>4: For minorVersions &gt;=1, the ordinal describing checksum type (1 byte)
76   * <li>5: For minorVersions &gt;=1, the number of data bytes/checksum chunk (4 bytes)
77   * <li>6: onDiskDataSizeWithHeader: For minorVersions &gt;=1, the size of data 'on disk', including
78   * header, excluding checksums (4 bytes)
79   * </ul>
80   * </li>
81   * <li><b>Raw/Compressed/Encrypted/Encoded data:</b> The compression
82   * algorithm is the same for all the blocks in an {@link HFile}. If compression is NONE, this is
83   * just raw, serialized Cells.
84   * <li><b>Tail:</b> For minorVersions &gt;=1, a series of 4 byte checksums, one each for
85   * the number of bytes specified by bytesPerChecksum.
86   * </ul>
87   * <h3>Caching</h3>
88   * Caches cache whole blocks with trailing checksums if any. We then tag on some metadata, the
89   * content of BLOCK_METADATA_SPACE: a flag that is on if we are doing 'hbase'
90   * checksums, and then the offset into the file, which is needed when we re-make a cache key
91   * when we return the block to the cache as 'done'. See {@link Cacheable#serialize(ByteBuffer, boolean)} and
92   * {@link Cacheable#getDeserializer()}.
93   *
94   * <p>TODO: Should we cache the checksums? Down in Writer#getBlockForCaching(CacheConfig) where
95   * we make a block to cache-on-write, there is an attempt at turning off checksums. This is not the
96   * only place we get blocks to cache. We also will cache the raw return from an hdfs read. In this
97   * case, the checksums may be present. If the cache is backed by something that doesn't do ECC,
98   * say an SSD, we might want to preserve checksums. For now this is open question.
99   * <p>TODO: Over in BucketCache, we save a block allocation by doing a custom serialization.
100  * Be sure to change it if serialization changes in here. Could we add a method here that takes an
101  * IOEngine and that then serializes to it rather than expose our internals over in BucketCache?
102  * IOEngine is in the bucket subpackage. Pull it up? Then this class knows about bucketcache. Ugh.
103  */
104 @InterfaceAudience.Private
105 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="HE_EQUALS_USE_HASHCODE",
106   justification="Fix!!! Fine for now bug FIXXXXXXX!!!!")
107 public class HFileBlock implements Cacheable {
108   private static final Log LOG = LogFactory.getLog(HFileBlock.class);
109 
110   /** Type of block. Header field 0. */
111   private BlockType blockType;
112 
113   /**
114    * Size on disk excluding header, including checksum. Header field 1.
115    * @see Writer#putHeader(byte[], int, int, int, int)
116    */
117   private int onDiskSizeWithoutHeader;
118 
119   /**
120    * Size of pure data. Does not include header or checksums. Header field 2.
121    * @see Writer#putHeader(byte[], int, int, int, int)
122    */
123   private int uncompressedSizeWithoutHeader;
124 
125   /**
126    * The offset of the previous block on disk. Header field 3.
127    * @see Writer#putHeader(byte[], int, int, int, int)
128    */
129   private long prevBlockOffset;
130 
131   /**
132    * Size on disk of header + data. Excludes checksum. Header field 6,
133    * OR calculated from {@link #onDiskSizeWithoutHeader} when using HDFS checksum.
134    * @see Writer#putHeader(byte[], int, int, int, int)
135    */
136   private int onDiskDataSizeWithHeader;
137 
138 
139   /**
140    * The in-memory representation of the hfile block. Can be on or offheap. Can be backed by
141    * a single ByteBuffer or by many. Make no assumptions.
142    *
143    * <p>Be careful reading from this <code>buf</code>. Duplicate and work on the duplicate or if
144    * not, be sure to reset position and limit else trouble down the road.
145    *
146    * <p>TODO: Make this read-only once made.
147    */
148   private ByteBuffer buf;
149 
150   /** Meta data that holds meta information on the hfileblock.
151    */
152   private HFileContext fileContext;
153 
154   /**
155    * The offset of this block in the file. Populated by the reader for
156    * convenience of access. This offset is not part of the block header.
157    */
158   private long offset = UNSET;
159 
160   /**
161    * The on-disk size of the next block, including the header and checksums if present, obtained by
162    * peeking into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the next block's
163    * header, or UNSET if unknown.
164    *
165    * Blocks try to carry the size of the next block to read in this data member. They will even have
166    * this value when served from cache. Could save a seek in the case where we are iterating through
167    * a file and some of the blocks come from cache. If from cache, then having this info to hand
168    * will save us doing a seek to read the header so we can read the body of a block.
169    * TODO: see how effective this is at saving seeks.
170    */
171   private int nextBlockOnDiskSize = UNSET;
172 
173   /**
174    * On a checksum failure, do these many succeeding read requests using hdfs checksums before
175    * auto-reenabling hbase checksum verification.
176    */
177   static final int CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD = 3;
178 
179   private static int UNSET = -1;
180   public static final boolean FILL_HEADER = true;
181   public static final boolean DONT_FILL_HEADER = false;
182 
183   public static final int BYTE_BUFFER_HEAP_SIZE = (int) ClassSize.estimateBase(
184       ByteBuffer.wrap(new byte[0], 0, 0).getClass(), false);
185 
186   /**
187    * Space for metadata on a block that gets stored along with the block when we cache it.
188    * There are a few bytes stuck on the end of the HFileBlock that we pull in from HDFS (note,
189    * when we read from HDFS, we pull in an HFileBlock AND the header of the next block if one).
190    * 8 bytes are offset of this block (long) in the file. Offset is important because
191    * used when we remake the CacheKey when we return the block to cache when done. There is also
192    * a flag on whether checksumming is being done by hbase or not. See class comment for note on
193    * uncertain state of checksumming of blocks that come out of cache (should we or should we not?).
194    * Finally there 4 bytes to hold the length of the next block which can save a seek on occasion.
195    * <p>This EXTRA came in with original commit of the bucketcache, HBASE-7404. Was formerly
196    * known as EXTRA_SERIALIZATION_SPACE.
197    */
198   static final int BLOCK_METADATA_SPACE = Bytes.SIZEOF_BYTE + Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT;
199 
200   /**
201    * Each checksum value is an integer that can be stored in 4 bytes.
202    */
203   static final int CHECKSUM_SIZE = Bytes.SIZEOF_INT;
204 
205   static final byte[] DUMMY_HEADER_NO_CHECKSUM =
206       new byte[HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM];
207 
208   /**
209    * Used deserializing blocks from Cache.
210    *
211    * <code>
212    * ++++++++++++++
213    * + HFileBlock +
214    * ++++++++++++++
215    * + Checksums  + <= Optional
216    * ++++++++++++++
217    * + Metadata!  +
218    * ++++++++++++++
219    * </code>
220    * @see #serialize(ByteBuffer)
221    */
222   static final CacheableDeserializer<Cacheable> BLOCK_DESERIALIZER =
223       new CacheableDeserializer<Cacheable>() {
224         @Override
225         public HFileBlock deserialize(ByteBuffer buf, boolean reuse) throws IOException{
226           // The buf has the file block followed by block metadata.
227           // Set limit to just before the BLOCK_METADATA_SPACE then rewind.
228           buf.limit(buf.limit() - BLOCK_METADATA_SPACE).rewind();
229           // Get a new buffer to pass the HFileBlock for it to 'own'.
230           ByteBuffer newByteBuff;
231           if (reuse) {
232             newByteBuff = buf.slice();
233           } else {
234             int len = buf.limit();
235             newByteBuff = ByteBuffer.allocate(len);
236             ByteBufferUtils.copyFromBufferToBuffer(newByteBuff, buf, buf.position(), 0, len);
237           }
238           // Read out the BLOCK_METADATA_SPACE content and shove into our HFileBlock.
239           buf.position(buf.limit());
240           buf.limit(buf.limit() + HFileBlock.BLOCK_METADATA_SPACE);
241           boolean usesChecksum = buf.get() == (byte)1;
242           long offset = buf.getLong();
243           int nextBlockOnDiskSize = buf.getInt();
244           HFileBlock hFileBlock =
245               new HFileBlock(newByteBuff, usesChecksum, offset, nextBlockOnDiskSize, null);
246           return hFileBlock;
247         }
248 
249         @Override
250         public int getDeserialiserIdentifier() {
251           return DESERIALIZER_IDENTIFIER;
252         }
253 
254         @Override
255         public HFileBlock deserialize(ByteBuffer b) throws IOException {
256           return deserialize(b, false);
257         }
258       };
259   private static final int DESERIALIZER_IDENTIFIER;
260   static {
261     DESERIALIZER_IDENTIFIER =
262         CacheableDeserializerIdManager.registerDeserializer(BLOCK_DESERIALIZER);
263   }
264 
265   // Todo: encapsulate Header related logic in this inner class.
266   static class Header {
267     // Format of header is:
268     // 8 bytes - block magic
269     // 4 bytes int - onDiskSizeWithoutHeader
270     // 4 bytes int - uncompressedSizeWithoutHeader
271     // 8 bytes long - prevBlockOffset
272     // The following 3 are only present if header contains checksum information
273     // 1 byte - checksum type
274     // 4 byte int - bytes per checksum
275     // 4 byte int - onDiskDataSizeWithHeader
276     static int BLOCK_MAGIC_INDEX = 0;
277     static int ON_DISK_SIZE_WITHOUT_HEADER_INDEX = 8;
278     static int UNCOMPRESSED_SIZE_WITHOUT_HEADER_INDEX = 12;
279     static int PREV_BLOCK_OFFSET_INDEX = 16;
280     static int CHECKSUM_TYPE_INDEX = 24;
281     static int BYTES_PER_CHECKSUM_INDEX = 25;
282     static int ON_DISK_DATA_SIZE_WITH_HEADER_INDEX = 29;
283   }
284 
/**
 * Copy constructor. All scalar fields and the file context reference are taken from
 * {@code that}; the buffer is a {@link ByteBuffer#duplicate() duplicate}, i.e. a shallow copy
 * sharing the same content but with its own position and limit.
 */
private HFileBlock(HFileBlock that) {
  init(that.blockType, that.onDiskSizeWithoutHeader, that.uncompressedSizeWithoutHeader,
      that.prevBlockOffset, that.offset, that.onDiskDataSizeWithHeader,
      that.nextBlockOnDiskSize, that.fileContext);
  this.buf = that.buf.duplicate();
}
299 
/**
 * Creates a new {@link HFile} block from the given fields. This constructor is used only while
 * writing blocks and caching: the block is already sitting in a byte buffer and we want to
 * stuff it into the cache. See {@link Writer#getBlockForCaching(CacheConfig)}.
 *
 * <p>TODO: The caller presumes no checksumming
 * required of this block instance since going into cache; checksum already verified on
 * underlying block data pulled in from filesystem. Is that correct? What if cache is SSD?
 *
 * <p>The passed buffer is always rewound before this constructor returns.
 *
 * @param blockType the type of this block, see {@link BlockType}
 * @param onDiskSizeWithoutHeader see {@link #onDiskSizeWithoutHeader}
 * @param uncompressedSizeWithoutHeader see {@link #uncompressedSizeWithoutHeader}
 * @param prevBlockOffset see {@link #prevBlockOffset}
 * @param b block header ({@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes)
 * @param fillHeader when true, write the header fields into the passed buffer
 * @param offset the file offset the block was read from
 * @param nextBlockOnDiskSize see {@link #nextBlockOnDiskSize}
 * @param onDiskDataSizeWithHeader see {@link #onDiskDataSizeWithHeader}
 * @param fileContext HFile meta data
 */
public HFileBlock(BlockType blockType, int onDiskSizeWithoutHeader,
    int uncompressedSizeWithoutHeader, long prevBlockOffset, ByteBuffer b, boolean fillHeader,
    long offset, final int nextBlockOnDiskSize, int onDiskDataSizeWithHeader,
    HFileContext fileContext) {
  this.buf = b;
  init(blockType, onDiskSizeWithoutHeader, uncompressedSizeWithoutHeader,
      prevBlockOffset, offset, onDiskDataSizeWithHeader, nextBlockOnDiskSize, fileContext);
  if (fillHeader) {
    // Needs the fields set by init(); leaves the buffer positioned after the header fields.
    overwriteHeader();
  }
  // Hand the buffer over positioned at its start whether or not the header was written.
  this.buf.rewind();
}
332 
333   /**
334    * Creates a block from an existing buffer starting with a header. Rewinds
335    * and takes ownership of the buffer. By definition of rewind, ignores the
336    * buffer position, but if you slice the buffer beforehand, it will rewind
337    * to that point.
338    * @param buf Has header, content, and trailing checksums if present.
339    */
340   HFileBlock(ByteBuffer buf, boolean usesHBaseChecksum, final long offset,
341       final int nextBlockOnDiskSize, HFileContext fileContext) throws IOException {
342     buf.rewind();
343     final BlockType blockType = BlockType.read(buf);
344     final int onDiskSizeWithoutHeader = buf.getInt(Header.ON_DISK_SIZE_WITHOUT_HEADER_INDEX);
345     final int uncompressedSizeWithoutHeader =
346         buf.getInt(Header.UNCOMPRESSED_SIZE_WITHOUT_HEADER_INDEX);
347     final long prevBlockOffset = buf.getLong(Header.PREV_BLOCK_OFFSET_INDEX);
348     // This constructor is called when we deserialize a block from cache and when we read a block in
349     // from the fs. fileCache is null when deserialized from cache so need to make up one.
350     HFileContextBuilder fileContextBuilder = fileContext != null?
351         new HFileContextBuilder(fileContext): new HFileContextBuilder();
352     fileContextBuilder.withHBaseCheckSum(usesHBaseChecksum);
353     int onDiskDataSizeWithHeader;
354     if (usesHBaseChecksum) {
355       byte checksumType = buf.get(Header.CHECKSUM_TYPE_INDEX);
356       int bytesPerChecksum = buf.getInt(Header.BYTES_PER_CHECKSUM_INDEX);
357       onDiskDataSizeWithHeader = buf.getInt(Header.ON_DISK_DATA_SIZE_WITH_HEADER_INDEX);
358       // Use the checksum type and bytes per checksum from header, not from filecontext.
359       fileContextBuilder.withChecksumType(ChecksumType.codeToType(checksumType));
360       fileContextBuilder.withBytesPerCheckSum(bytesPerChecksum);
361     } else {
362       fileContextBuilder.withChecksumType(ChecksumType.NULL);
363       fileContextBuilder.withBytesPerCheckSum(0);
364       // Need to fix onDiskDataSizeWithHeader; there are not checksums after-block-data
365       onDiskDataSizeWithHeader = onDiskSizeWithoutHeader + headerSize(usesHBaseChecksum);
366     }
367     fileContext = fileContextBuilder.build();
368     assert usesHBaseChecksum == fileContext.isUseHBaseChecksum();
369     init(blockType, onDiskSizeWithoutHeader, uncompressedSizeWithoutHeader,
370         prevBlockOffset, offset, onDiskDataSizeWithHeader, nextBlockOnDiskSize, fileContext);
371     this.offset = offset;
372     this.buf = buf;
373     this.buf.rewind();
374  }
375 
376   /**
377    * Called from constructors.
378    */
379   private void init(BlockType blockType, int onDiskSizeWithoutHeader,
380       int uncompressedSizeWithoutHeader, long prevBlockOffset,
381       long offset, int onDiskDataSizeWithHeader, final int nextBlockOnDiskSize,
382       HFileContext fileContext) {
383     this.blockType = blockType;
384     this.onDiskSizeWithoutHeader = onDiskSizeWithoutHeader;
385     this.uncompressedSizeWithoutHeader = uncompressedSizeWithoutHeader;
386     this.prevBlockOffset = prevBlockOffset;
387     this.offset = offset;
388     this.onDiskDataSizeWithHeader = onDiskDataSizeWithHeader;
389     this.nextBlockOnDiskSize = nextBlockOnDiskSize;
390     this.fileContext = fileContext;
391   }
392 
393   /**
394    * Parse total ondisk size including header and checksum.
395    * @param headerBuf Header ByteBuffer. Presumed exact size of header.
396    * @param verifyChecksum true if checksum verification is in use.
397    * @return Size of the block with header included.
398    */
399   private static int getOnDiskSizeWithHeader(final ByteBuffer headerBuf, boolean verifyChecksum) {
400     return headerBuf.getInt(Header.ON_DISK_SIZE_WITHOUT_HEADER_INDEX) +
401       headerSize(verifyChecksum);
402   }
403 
404   /**
405    * @return the on-disk size of the next block (including the header size and any checksums if
406    * present) read by peeking into the next block's header; use as a hint when doing
407    * a read of the next block when scanning or running over a file.
408    */
409   public int getNextBlockOnDiskSize() {
410     return nextBlockOnDiskSize;
411   }
412 
413   @Override
414   public BlockType getBlockType() {
415     return blockType;
416   }
417 
418   /** @return get data block encoding id that was used to encode this block */
419   public short getDataBlockEncodingId() {
420     if (blockType != BlockType.ENCODED_DATA) {
421       throw new IllegalArgumentException("Querying encoder ID of a block " +
422           "of type other than " + BlockType.ENCODED_DATA + ": " + blockType);
423     }
424     return buf.getShort(headerSize());
425   }
426 
427   /**
428    * @return the on-disk size of header + data part + checksum.
429    */
430   public int getOnDiskSizeWithHeader() {
431     return onDiskSizeWithoutHeader + headerSize();
432   }
433 
434   /**
435    * @return the on-disk size of the data part + checksum (header excluded).
436    */
437   int getOnDiskSizeWithoutHeader() {
438     return onDiskSizeWithoutHeader;
439   }
440 
441   /**
442    * @return the uncompressed size of data part (header and checksum excluded).
443    */
444   public int getUncompressedSizeWithoutHeader() {
445     return uncompressedSizeWithoutHeader;
446   }
447 
448   /**
449    * @return the offset of the previous block of the same type in the file, or
450    *         -1 if unknown
451    */
452   long getPrevBlockOffset() {
453     return prevBlockOffset;
454   }
455 
456   /**
457    * Rewinds {@code buf} and writes first 4 header fields. {@code buf} position
458    * is modified as side-effect.
459    */
460   private void overwriteHeader() {
461     buf.rewind();
462     blockType.write(buf);
463     buf.putInt(onDiskSizeWithoutHeader);
464     buf.putInt(uncompressedSizeWithoutHeader);
465     buf.putLong(prevBlockOffset);
466     if (this.fileContext.isUseHBaseChecksum()) {
467       buf.put(fileContext.getChecksumType().getCode());
468       buf.putInt(fileContext.getBytesPerChecksum());
469       buf.putInt(onDiskDataSizeWithHeader);
470     }
471   }
472 
473   /**
474    * Returns a buffer that does not include the header or checksum.
475    *
476    * @return the buffer with header skipped and checksum omitted.
477    */
478   public ByteBuffer getBufferWithoutHeader() {
479     ByteBuffer dup = getBufferReadOnly();
480     // Now set it up so Buffer spans content only -- no header or no checksums.
481     dup.position(headerSize()).limit(buf.limit() - totalChecksumBytes());
482     return dup.slice();
483   }
484 
485   /**
486    * Returns a read-only duplicate of the buffer this block stores internally ready to be read.
487    * Clients must not modify the buffer object though they may set position and limit on the
488    * returned buffer since we pass back a duplicate. This method has to be public because it is used
489    * used in {@link org.apache.hadoop.hbase.util.CompoundBloomFilter}
490    * to avoid object creation on every Bloom
491    * filter lookup, but has to be used with caution. Buffer holds header, block content,
492    * and any follow-on checksums if present.
493    *
494    * @return the buffer of this block for read-only operations
495    */
496   public ByteBuffer getBufferReadOnly() {
497     ByteBuffer dup = this.buf.duplicate();
498     return dup;
499   }
500 
501   private void sanityCheckAssertion(long valueFromBuf, long valueFromField,
502       String fieldName) throws IOException {
503     if (valueFromBuf != valueFromField) {
504       throw new AssertionError(fieldName + " in the buffer (" + valueFromBuf
505           + ") is different from that in the field (" + valueFromField + ")");
506     }
507   }
508 
509   private void sanityCheckAssertion(BlockType valueFromBuf, BlockType valueFromField)
510       throws IOException {
511     if (valueFromBuf != valueFromField) {
512       throw new IOException("Block type stored in the buffer: " +
513         valueFromBuf + ", block type field: " + valueFromField);
514     }
515   }
516 
517   /**
518    * Checks if the block is internally consistent, i.e. the first
519    * {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the buffer contain a
520    * valid header consistent with the fields. Assumes a packed block structure.
521    * This function is primary for testing and debugging, and is not
522    * thread-safe, because it alters the internal buffer pointer.
523    * Used by tests only.
524    */
525   void sanityCheck() throws IOException {
526     // Duplicate so no side-effects
527     ByteBuffer dup = this.buf.duplicate();
528     dup.rewind();
529     sanityCheckAssertion(BlockType.read(dup), blockType);
530     sanityCheckAssertion(dup.getInt(), onDiskSizeWithoutHeader, "onDiskSizeWithoutHeader");
531     sanityCheckAssertion(dup.getInt(), uncompressedSizeWithoutHeader,
532         "uncompressedSizeWithoutHeader");
533     sanityCheckAssertion(dup.getLong(), prevBlockOffset, "prevBlockOffset");
534     if (this.fileContext.isUseHBaseChecksum()) {
535       sanityCheckAssertion(dup.get(), this.fileContext.getChecksumType().getCode(), "checksumType");
536       sanityCheckAssertion(dup.getInt(), this.fileContext.getBytesPerChecksum(),
537           "bytesPerChecksum");
538       sanityCheckAssertion(dup.getInt(), onDiskDataSizeWithHeader, "onDiskDataSizeWithHeader");
539     }
540 
541     int cksumBytes = totalChecksumBytes();
542     int expectedBufLimit = onDiskDataSizeWithHeader + cksumBytes;
543     if (dup.limit() != expectedBufLimit) {
544       throw new AssertionError("Expected limit " + expectedBufLimit + ", got " + dup.limit());
545     }
546 
547     // We might optionally allocate HFILEBLOCK_HEADER_SIZE more bytes to read the next
548     // block's header, so there are two sensible values for buffer capacity.
549     int hdrSize = headerSize();
550     if (dup.capacity() != expectedBufLimit && dup.capacity() != expectedBufLimit + hdrSize) {
551       throw new AssertionError("Invalid buffer capacity: " + dup.capacity() +
552           ", expected " + expectedBufLimit + " or " + (expectedBufLimit + hdrSize));
553     }
554   }
555 
556   @Override
557   public String toString() {
558     StringBuilder sb = new StringBuilder()
559       .append("[")
560       .append("blockType=").append(blockType)
561       .append(", fileOffset=").append(offset)
562       .append(", headerSize=").append(headerSize())
563       .append(", onDiskSizeWithoutHeader=").append(onDiskSizeWithoutHeader)
564       .append(", uncompressedSizeWithoutHeader=").append(uncompressedSizeWithoutHeader)
565       .append(", prevBlockOffset=").append(prevBlockOffset)
566       .append(", isUseHBaseChecksum=").append(fileContext.isUseHBaseChecksum());
567     if (fileContext.isUseHBaseChecksum()) {
568       sb.append(", checksumType=").append(ChecksumType.codeToType(this.buf.get(24)))
569         .append(", bytesPerChecksum=").append(this.buf.getInt(24 + 1))
570         .append(", onDiskDataSizeWithHeader=").append(onDiskDataSizeWithHeader);
571     } else {
572       sb.append(", onDiskDataSizeWithHeader=").append(onDiskDataSizeWithHeader)
573         .append("(").append(onDiskSizeWithoutHeader)
574         .append("+").append(HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM).append(")");
575     }
576     String dataBegin = null;
577     if (buf.hasArray()) {
578       dataBegin = Bytes.toStringBinary(buf.array(), buf.arrayOffset() + headerSize(),
579           Math.min(32, buf.limit() - buf.arrayOffset() - headerSize()));
580     } else {
581       ByteBuffer bufWithoutHeader = getBufferWithoutHeader();
582       byte[] dataBeginBytes = new byte[Math.min(32,
583           bufWithoutHeader.limit() - bufWithoutHeader.position())];
584       bufWithoutHeader.get(dataBeginBytes);
585       dataBegin = Bytes.toStringBinary(dataBeginBytes);
586     }
587     sb.append(", getOnDiskSizeWithHeader=").append(getOnDiskSizeWithHeader())
588       .append(", totalChecksumBytes=").append(totalChecksumBytes())
589       .append(", isUnpacked=").append(isUnpacked())
590       .append(", buf=[").append(buf).append("]")
591       .append(", dataBeginsWith=").append(dataBegin)
592       .append(", fileContext=").append(fileContext)
593       .append(", nextBlockOnDiskSize=").append(nextBlockOnDiskSize)
594       .append("]");
595     return sb.toString();
596   }
597 
598   /**
599    * Retrieves the decompressed/decrypted view of this block. An encoded block remains in its
600    * encoded structure. Internal structures are shared between instances where applicable.
601    */
602   HFileBlock unpack(HFileContext fileContext, FSReader reader) throws IOException {
603     if (!fileContext.isCompressedOrEncrypted()) {
604       // TODO: cannot use our own fileContext here because HFileBlock(ByteBuffer, boolean),
605       // which is used for block serialization to L2 cache, does not preserve encoding and
606       // encryption details.
607       return this;
608     }
609 
610     HFileBlock unpacked = new HFileBlock(this);
611     unpacked.allocateBuffer(); // allocates space for the decompressed block
612 
613     HFileBlockDecodingContext ctx = blockType == BlockType.ENCODED_DATA ?
614       reader.getBlockDecodingContext() : reader.getDefaultBlockDecodingContext();
615 
616     ByteBuffer dup = this.buf.duplicate();
617     dup.position(this.headerSize());
618     dup = dup.slice();
619     ctx.prepareDecoding(unpacked.getOnDiskSizeWithoutHeader(),
620       unpacked.getUncompressedSizeWithoutHeader(), unpacked.getBufferWithoutHeader(),
621       dup);
622     return unpacked;
623   }
624 
625   /**
626    * Always allocates a new buffer of the correct size. Copies header bytes
627    * from the existing buffer. Does not change header fields.
628    * Reserve room to keep checksum bytes too.
629    */
630   private void allocateBuffer() {
631     int cksumBytes = totalChecksumBytes();
632     int headerSize = headerSize();
633     int capacityNeeded = headerSize + uncompressedSizeWithoutHeader + cksumBytes;
634 
635     // TODO we need consider allocating offheap here?
636     ByteBuffer newBuf = ByteBuffer.allocate(capacityNeeded);
637 
638     // Copy header bytes into newBuf.
639     // newBuf is HBB so no issue in calling array()
640     ByteBuffer dup = buf.duplicate();
641     dup.position(0);
642     dup.get(newBuf.array(), newBuf.arrayOffset(), headerSize);
643 
644     buf = newBuf;
645     // set limit to exclude next block's header
646     buf.limit(headerSize + uncompressedSizeWithoutHeader + cksumBytes);
647   }
648 
649   /**
650    * Return true when this block's buffer has been unpacked, false otherwise. Note this is a
651    * calculated heuristic, not tracked attribute of the block.
652    */
653   public boolean isUnpacked() {
654     final int cksumBytes = totalChecksumBytes();
655     final int headerSize = headerSize();
656     final int expectedCapacity = headerSize + uncompressedSizeWithoutHeader + cksumBytes;
657     final int bufCapacity = buf.capacity();
658     return bufCapacity == expectedCapacity || bufCapacity == expectedCapacity + headerSize;
659   }
660 
661   /** An additional sanity-check in case no compression or encryption is being used. */
662   public void sanityCheckUncompressedSize() throws IOException {
663     if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader + totalChecksumBytes()) {
664       throw new IOException("Using no compression but "
665           + "onDiskSizeWithoutHeader=" + onDiskSizeWithoutHeader + ", "
666           + "uncompressedSizeWithoutHeader=" + uncompressedSizeWithoutHeader
667           + ", numChecksumbytes=" + totalChecksumBytes());
668     }
669   }
670 
671   /**
672    * Cannot be {@link #UNSET}. Must be a legitimate value. Used re-making the {@link CacheKey} when
673    * block is returned to the cache.
674    * @return the offset of this block in the file it was read from
675    */
676   long getOffset() {
677     if (offset < 0) {
678       throw new IllegalStateException("HFile block offset not initialized properly");
679     }
680     return offset;
681   }
682 
683   /**
684    * @return a byte stream reading the data + checksum of this block
685    */
686   public DataInputStream getByteStream() {
687     ByteBuffer dup = this.buf.duplicate();
688     dup.position(this.headerSize());
689     return new DataInputStream(new ByteBufferInputStream(dup));
690   }
691 
692   @Override
693   public long heapSize() {
694     long size = ClassSize.align(
695         ClassSize.OBJECT +
696         // Block type, byte buffer and meta references
697         3 * ClassSize.REFERENCE +
698         // On-disk size, uncompressed size, and next block's on-disk size
699         // bytePerChecksum and onDiskDataSize
700         4 * Bytes.SIZEOF_INT +
701         // This and previous block offset
702         2 * Bytes.SIZEOF_LONG +
703         // Heap size of the meta object. meta will be always not null.
704         fileContext.heapSize()
705     );
706 
707     if (buf != null) {
708       // Deep overhead of the byte buffer. Needs to be aligned separately.
709       size += ClassSize.align(buf.capacity() + BYTE_BUFFER_HEAP_SIZE);
710     }
711 
712     return ClassSize.align(size);
713   }
714 
715   /**
716    * Read from an input stream at least <code>necessaryLen</code> and if possible,
717    * <code>extraLen</code> also if available. Analogous to
718    * {@link IOUtils#readFully(InputStream, byte[], int, int)}, but specifies a
719    * number of "extra" bytes to also optionally read.
720    *
721    * @param in the input stream to read from
722    * @param buf the buffer to read into
723    * @param bufOffset the destination offset in the buffer
724    * @param necessaryLen the number of bytes that are absolutely necessary to read
725    * @param extraLen the number of extra bytes that would be nice to read
726    * @return true if succeeded reading the extra bytes
727    * @throws IOException if failed to read the necessary bytes
728    */
729   static boolean readWithExtra(InputStream in, byte[] buf,
730       int bufOffset, int necessaryLen, int extraLen) throws IOException {
731     int bytesRemaining = necessaryLen + extraLen;
732     while (bytesRemaining > 0) {
733       int ret = in.read(buf, bufOffset, bytesRemaining);
734       if (ret == -1 && bytesRemaining <= extraLen) {
735         // We could not read the "extra data", but that is OK.
736         break;
737       }
738       if (ret < 0) {
739         throw new IOException("Premature EOF from inputStream (read "
740             + "returned " + ret + ", was trying to read " + necessaryLen
741             + " necessary bytes and " + extraLen + " extra bytes, "
742             + "successfully read "
743             + (necessaryLen + extraLen - bytesRemaining));
744       }
745       bufOffset += ret;
746       bytesRemaining -= ret;
747     }
748     return bytesRemaining <= 0;
749   }
750 
751   /**
752    * Read from an input stream at least <code>necessaryLen</code> and if possible,
753    * <code>extraLen</code> also if available. Analogous to
754    * {@link IOUtils#readFully(InputStream, byte[], int, int)}, but uses
755    * positional read and specifies a number of "extra" bytes that would be
756    * desirable but not absolutely necessary to read.
757    *
758    * @param in the input stream to read from
759    * @param position the position within the stream from which to start reading
760    * @param buf the buffer to read into
761    * @param bufOffset the destination offset in the buffer
762    * @param necessaryLen the number of bytes that are absolutely necessary to
763    *     read
764    * @param extraLen the number of extra bytes that would be nice to read
765    * @return true if and only if extraLen is > 0 and reading those extra bytes
766    *     was successful
767    * @throws IOException if failed to read the necessary bytes
768    */
769   static boolean positionalReadWithExtra(FSDataInputStream in,
770       long position, byte[] buf, int bufOffset, int necessaryLen, int extraLen)
771       throws IOException {
772     int bytesRemaining = necessaryLen + extraLen;
773     int bytesRead = 0;
774     while (bytesRead < necessaryLen) {
775       int ret = in.read(position, buf, bufOffset, bytesRemaining);
776       if (ret < 0) {
777         throw new IOException("Premature EOF from inputStream (positional read "
778             + "returned " + ret + ", was trying to read " + necessaryLen
779             + " necessary bytes and " + extraLen + " extra bytes, "
780             + "successfully read " + bytesRead);
781       }
782       position += ret;
783       bufOffset += ret;
784       bytesRemaining -= ret;
785       bytesRead += ret;
786     }
787     return bytesRead != necessaryLen && bytesRemaining <= 0;
788   }
789 
790   /**
791    * Unified version 2 {@link HFile} block writer. The intended usage pattern
792    * is as follows:
793    * <ol>
794    * <li>Construct an {@link HFileBlock.Writer}, providing a compression algorithm.
795    * <li>Call {@link Writer#startWriting} and get a data stream to write to.
796    * <li>Write your data into the stream.
797    * <li>Call Writer#writeHeaderAndData(FSDataOutputStream) as many times as you need to.
798    * store the serialized block into an external stream.
799    * <li>Repeat to write more blocks.
800    * </ol>
801    * <p>
802    */
803   public static class Writer {
804     private enum State {
805       INIT,
806       WRITING,
807       BLOCK_READY
808     };
809 
810     /** Writer state. Used to ensure the correct usage protocol. */
811     private State state = State.INIT;
812 
813     /** Data block encoder used for data blocks */
814     private final HFileDataBlockEncoder dataBlockEncoder;
815 
816     private HFileBlockEncodingContext dataBlockEncodingCtx;
817 
818     /** block encoding context for non-data blocks*/
819     private HFileBlockDefaultEncodingContext defaultBlockEncodingCtx;
820 
821     /**
822      * The stream we use to accumulate data into a block in an uncompressed format.
823      * We reset this stream at the end of each block and reuse it. The
824      * header is written as the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes into this
825      * stream.
826      */
827     private ByteArrayOutputStream baosInMemory;
828 
829     /**
830      * Current block type. Set in {@link #startWriting(BlockType)}. Could be
831      * changed in {@link #finishBlock()} from {@link BlockType#DATA}
832      * to {@link BlockType#ENCODED_DATA}.
833      */
834     private BlockType blockType;
835 
836     /**
837      * A stream that we write uncompressed bytes to, which compresses them and
838      * writes them to {@link #baosInMemory}.
839      */
840     private DataOutputStream userDataStream;
841 
842     // Size of actual data being written. Not considering the block encoding/compression. This
843     // includes the header size also.
844     private int unencodedDataSizeWritten;
845 
846     // Size of actual data being written. considering the block encoding. This
847     // includes the header size also.
848     private int encodedDataSizeWritten;
849 
850     /**
851      * Bytes to be written to the file system, including the header. Compressed
852      * if compression is turned on. It also includes the checksum data that
853      * immediately follows the block data. (header + data + checksums)
854      */
855     private ByteArrayOutputStream onDiskBlockBytesWithHeader;
856 
857     /**
858      * The size of the checksum data on disk. It is used only if data is
859      * not compressed. If data is compressed, then the checksums are already
860      * part of onDiskBytesWithHeader. If data is uncompressed, then this
861      * variable stores the checksum data for this block.
862      */
863     private byte[] onDiskChecksum = HConstants.EMPTY_BYTE_ARRAY;
864 
865     /**
866      * Current block's start offset in the {@link HFile}. Set in
867      * {@link #writeHeaderAndData(FSDataOutputStream)}.
868      */
869     private long startOffset;
870 
871     /**
872      * Offset of previous block by block type. Updated when the next block is
873      * started.
874      */
875     private long[] prevOffsetByType;
876 
877     /** The offset of the previous block of the same type */
878     private long prevOffset;
879     /** Meta data that holds information about the hfileblock**/
880     private HFileContext fileContext;
881 
882     /**
883      * @param dataBlockEncoder data block encoding algorithm to use
884      */
885     public Writer(HFileDataBlockEncoder dataBlockEncoder, HFileContext fileContext) {
886       if (fileContext.getBytesPerChecksum() < HConstants.HFILEBLOCK_HEADER_SIZE) {
887         throw new RuntimeException("Unsupported value of bytesPerChecksum. " +
888             " Minimum is " + HConstants.HFILEBLOCK_HEADER_SIZE + " but the configured value is " +
889             fileContext.getBytesPerChecksum());
890       }
891       this.dataBlockEncoder = dataBlockEncoder != null?
892           dataBlockEncoder: NoOpDataBlockEncoder.INSTANCE;
893       this.dataBlockEncodingCtx = this.dataBlockEncoder.
894           newDataBlockEncodingContext(HConstants.HFILEBLOCK_DUMMY_HEADER, fileContext);
895       // TODO: This should be lazily instantiated since we usually do NOT need this default encoder
896       this.defaultBlockEncodingCtx = new HFileBlockDefaultEncodingContext(null,
897           HConstants.HFILEBLOCK_DUMMY_HEADER, fileContext);
898       // TODO: Set BAOS initial size. Use fileContext.getBlocksize() and add for header/checksum
899       baosInMemory = new ByteArrayOutputStream();
900       prevOffsetByType = new long[BlockType.values().length];
901       for (int i = 0; i < prevOffsetByType.length; ++i) {
902         prevOffsetByType[i] = UNSET;
903       }
904       // TODO: Why fileContext saved away when we have dataBlockEncoder and/or
905       // defaultDataBlockEncoder?
906       this.fileContext = fileContext;
907     }
908 
909     /**
910      * Starts writing into the block. The previous block's data is discarded.
911      *
912      * @return the stream the user can write their data into
913      * @throws IOException
914      */
915     DataOutputStream startWriting(BlockType newBlockType)
916         throws IOException {
917       if (state == State.BLOCK_READY && startOffset != -1) {
918         // We had a previous block that was written to a stream at a specific
919         // offset. Save that offset as the last offset of a block of that type.
920         prevOffsetByType[blockType.getId()] = startOffset;
921       }
922 
923       startOffset = -1;
924       blockType = newBlockType;
925 
926       baosInMemory.reset();
927       baosInMemory.write(HConstants.HFILEBLOCK_DUMMY_HEADER);
928 
929       state = State.WRITING;
930 
931       // We will compress it later in finishBlock()
932       userDataStream = new DataOutputStream(baosInMemory);
933       if (newBlockType == BlockType.DATA) {
934         this.dataBlockEncoder.startBlockEncoding(dataBlockEncodingCtx, userDataStream);
935       }
936       this.unencodedDataSizeWritten = 0;
937       this.encodedDataSizeWritten = 0;
938       return userDataStream;
939     }
940 
941     /**
942      * Writes the Cell to this block
943      * @param cell
944      * @throws IOException
945      */
946     void write(Cell cell) throws IOException{
947       expectState(State.WRITING);
948       int posBeforeEncode = this.userDataStream.size();
949       this.unencodedDataSizeWritten +=
950           this.dataBlockEncoder.encode(cell, dataBlockEncodingCtx, this.userDataStream);
951       this.encodedDataSizeWritten += this.userDataStream.size() - posBeforeEncode;
952     }
953 
954     /**
955      * Returns the stream for the user to write to. The block writer takes care
956      * of handling compression and buffering for caching on write. Can only be
957      * called in the "writing" state.
958      *
959      * @return the data output stream for the user to write to
960      */
961     DataOutputStream getUserDataStream() {
962       expectState(State.WRITING);
963       return userDataStream;
964     }
965 
966     /**
967      * Transitions the block writer from the "writing" state to the "block
968      * ready" state.  Does nothing if a block is already finished.
969      */
970     void ensureBlockReady() throws IOException {
971       Preconditions.checkState(state != State.INIT,
972           "Unexpected state: " + state);
973 
974       if (state == State.BLOCK_READY) {
975         return;
976       }
977 
978       // This will set state to BLOCK_READY.
979       finishBlock();
980     }
981 
982     /**
983      * Finish up writing of the block.
984      * Flushes the compressing stream (if using compression), fills out the header,
985      * does any compression/encryption of bytes to flush out to disk, and manages
986      * the cache on write content, if applicable. Sets block write state to "block ready".
987      */
988     private void finishBlock() throws IOException {
989       if (blockType == BlockType.DATA) {
990         this.dataBlockEncoder.endBlockEncoding(dataBlockEncodingCtx, userDataStream,
991             baosInMemory.getBuffer(), blockType);
992         blockType = dataBlockEncodingCtx.getBlockType();
993       }
994       userDataStream.flush();
995       prevOffset = prevOffsetByType[blockType.getId()];
996 
997       // We need to set state before we can package the block up for cache-on-write. In a way, the
998       // block is ready, but not yet encoded or compressed.
999       state = State.BLOCK_READY;
1000       Bytes compressAndEncryptDat;
1001       if (blockType == BlockType.DATA || blockType == BlockType.ENCODED_DATA) {
1002         compressAndEncryptDat = dataBlockEncodingCtx.
1003             compressAndEncrypt(baosInMemory.getBuffer(),
1004                     0, baosInMemory.size());
1005       } else {
1006         compressAndEncryptDat = defaultBlockEncodingCtx.
1007             compressAndEncrypt(baosInMemory.getBuffer(),
1008                     0, baosInMemory.size());
1009       }
1010       if (compressAndEncryptDat == null) {
1011         compressAndEncryptDat = new Bytes(baosInMemory.getBuffer(),
1012           0, baosInMemory.size());
1013       }
1014       if (onDiskBlockBytesWithHeader == null) {
1015         onDiskBlockBytesWithHeader = new ByteArrayOutputStream(compressAndEncryptDat.getLength());
1016       }
1017       onDiskBlockBytesWithHeader.reset();
1018       onDiskBlockBytesWithHeader.write(compressAndEncryptDat.get(),
1019             compressAndEncryptDat.getOffset(), compressAndEncryptDat.getLength());
1020       // Calculate how many bytes we need for checksum on the tail of the block.
1021       int numBytes = (int) ChecksumUtil.numBytes(
1022           onDiskBlockBytesWithHeader.size(),
1023           fileContext.getBytesPerChecksum());
1024 
1025       // Put the header for the on disk bytes; header currently is unfilled-out
1026       putHeader(onDiskBlockBytesWithHeader,
1027           onDiskBlockBytesWithHeader.size() + numBytes,
1028           baosInMemory.size(), onDiskBlockBytesWithHeader.size());
1029       // Set the header for the uncompressed bytes (for cache-on-write)
1030       if (onDiskChecksum.length != numBytes) {
1031         onDiskChecksum = new byte[numBytes];
1032       }
1033       ChecksumUtil.generateChecksums(
1034           onDiskBlockBytesWithHeader.getBuffer(), 0,onDiskBlockBytesWithHeader.size(),
1035           onDiskChecksum, 0, fileContext.getChecksumType(), fileContext.getBytesPerChecksum());
1036     }
1037     private void putHeader(ByteArrayOutputStream dest, int onDiskSize,
1038       int uncompressedSize, int onDiskDataSize) {
1039       putHeader(dest.getBuffer(),0, onDiskSize, uncompressedSize, onDiskDataSize);
1040     }
1041     /**
1042      * Put the header into the given byte array at the given offset.
1043      * @param onDiskSize size of the block on disk header + data + checksum
1044      * @param uncompressedSize size of the block after decompression (but
1045      *          before optional data block decoding) including header
1046      * @param onDiskDataSize size of the block on disk with header
1047      *        and data but not including the checksums
1048      */
1049     private void putHeader(byte[] dest, int offset, int onDiskSize,
1050         int uncompressedSize, int onDiskDataSize) {
1051       offset = blockType.put(dest, offset);
1052       offset = Bytes.putInt(dest, offset, onDiskSize - HConstants.HFILEBLOCK_HEADER_SIZE);
1053       offset = Bytes.putInt(dest, offset, uncompressedSize - HConstants.HFILEBLOCK_HEADER_SIZE);
1054       offset = Bytes.putLong(dest, offset, prevOffset);
1055       offset = Bytes.putByte(dest, offset, fileContext.getChecksumType().getCode());
1056       offset = Bytes.putInt(dest, offset, fileContext.getBytesPerChecksum());
1057       Bytes.putInt(dest, offset, onDiskDataSize);
1058     }
1059 
1060     /**
1061      * Similar to {@link #writeHeaderAndData(FSDataOutputStream)}, but records
1062      * the offset of this block so that it can be referenced in the next block
1063      * of the same type.
1064      *
1065      * @param out
1066      * @throws IOException
1067      */
1068     void writeHeaderAndData(FSDataOutputStream out) throws IOException {
1069       long offset = out.getPos();
1070       if (startOffset != UNSET && offset != startOffset) {
1071         throw new IOException("A " + blockType + " block written to a "
1072             + "stream twice, first at offset " + startOffset + ", then at "
1073             + offset);
1074       }
1075       startOffset = offset;
1076 
1077       finishBlockAndWriteHeaderAndData((DataOutputStream) out);
1078     }
1079 
1080     /**
1081      * Writes the header and the compressed data of this block (or uncompressed
1082      * data when not using compression) into the given stream. Can be called in
1083      * the "writing" state or in the "block ready" state. If called in the
1084      * "writing" state, transitions the writer to the "block ready" state.
1085      *
1086      * @param out the output stream to write the
1087      * @throws IOException
1088      */
1089     protected void finishBlockAndWriteHeaderAndData(DataOutputStream out)
1090       throws IOException {
1091       ensureBlockReady();
1092       long startTime = System.currentTimeMillis();
1093       out.write(onDiskBlockBytesWithHeader.getBuffer(), 0, onDiskBlockBytesWithHeader.size());
1094       out.write(onDiskChecksum);
1095       HFile.updateWriteLatency(System.currentTimeMillis() - startTime);
1096     }
1097 
1098     /**
1099      * Returns the header or the compressed data (or uncompressed data when not
1100      * using compression) as a byte array. Can be called in the "writing" state
1101      * or in the "block ready" state. If called in the "writing" state,
1102      * transitions the writer to the "block ready" state. This returns
1103      * the header + data + checksums stored on disk.
1104      *
1105      * @return header and data as they would be stored on disk in a byte array
1106      * @throws IOException
1107      */
1108     byte[] getHeaderAndDataForTest() throws IOException {
1109       ensureBlockReady();
1110       // This is not very optimal, because we are doing an extra copy.
1111       // But this method is used only by unit tests.
1112       byte[] output =
1113           new byte[onDiskBlockBytesWithHeader.size()
1114               + onDiskChecksum.length];
1115       System.arraycopy(onDiskBlockBytesWithHeader.getBuffer(), 0, output, 0,
1116           onDiskBlockBytesWithHeader.size());
1117       System.arraycopy(onDiskChecksum, 0, output,
1118           onDiskBlockBytesWithHeader.size(), onDiskChecksum.length);
1119       return output;
1120     }
1121 
1122     /**
1123      * Releases resources used by this writer.
1124      */
1125     void release() {
1126       if (dataBlockEncodingCtx != null) {
1127         dataBlockEncodingCtx.close();
1128         dataBlockEncodingCtx = null;
1129       }
1130       if (defaultBlockEncodingCtx != null) {
1131         defaultBlockEncodingCtx.close();
1132         defaultBlockEncodingCtx = null;
1133       }
1134     }
1135 
1136     /**
1137      * Returns the on-disk size of the data portion of the block. This is the
1138      * compressed size if compression is enabled. Can only be called in the
1139      * "block ready" state. Header is not compressed, and its size is not
1140      * included in the return value.
1141      *
1142      * @return the on-disk size of the block, not including the header.
1143      */
1144     int getOnDiskSizeWithoutHeader() {
1145       expectState(State.BLOCK_READY);
1146       return onDiskBlockBytesWithHeader.size() +
1147           onDiskChecksum.length - HConstants.HFILEBLOCK_HEADER_SIZE;
1148     }
1149 
1150     /**
1151      * Returns the on-disk size of the block. Can only be called in the
1152      * "block ready" state.
1153      *
1154      * @return the on-disk size of the block ready to be written, including the
1155      *         header size, the data and the checksum data.
1156      */
1157     int getOnDiskSizeWithHeader() {
1158       expectState(State.BLOCK_READY);
1159       return onDiskBlockBytesWithHeader.size() + onDiskChecksum.length;
1160     }
1161 
1162     /**
1163      * The uncompressed size of the block data. Does not include header size.
1164      */
1165     int getUncompressedSizeWithoutHeader() {
1166       expectState(State.BLOCK_READY);
1167       return baosInMemory.size() - HConstants.HFILEBLOCK_HEADER_SIZE;
1168     }
1169 
1170     /**
1171      * The uncompressed size of the block data, including header size.
1172      */
1173     int getUncompressedSizeWithHeader() {
1174       expectState(State.BLOCK_READY);
1175       return baosInMemory.size();
1176     }
1177 
1178     /** @return true if a block is being written  */
1179     boolean isWriting() {
1180       return state == State.WRITING;
1181     }
1182 
1183     /**
1184      * Returns the number of bytes written into the current block so far, or
1185      * zero if not writing the block at the moment. Note that this will return
1186      * zero in the "block ready" state as well.
1187      *
1188      * @return the number of bytes written
1189      */
1190     public int encodedBlockSizeWritten() {
1191       if (state != State.WRITING)
1192         return 0;
1193       return this.encodedDataSizeWritten;
1194     }
1195 
1196     /**
1197      * Returns the number of bytes written into the current block so far, or
1198      * zero if not writing the block at the moment. Note that this will return
1199      * zero in the "block ready" state as well.
1200      *
1201      * @return the number of bytes written
1202      */
1203     int blockSizeWritten() {
1204       if (state != State.WRITING) return 0;
1205       return this.unencodedDataSizeWritten;
1206     }
1207 
1208     /**
1209      * Clones the header followed by the uncompressed data, even if using
1210      * compression. This is needed for storing uncompressed blocks in the block
1211      * cache. Can be called in the "writing" state or the "block ready" state.
1212      * Returns only the header and data, does not include checksum data.
1213      *
1214      * @return Returns a copy of uncompressed block bytes for caching on write
1215      */
1216     ByteBuffer cloneUncompressedBufferWithHeader() {
1217       expectState(State.BLOCK_READY);
1218       byte[] uncompressedBlockBytesWithHeader = baosInMemory.toByteArray();
1219       int numBytes = (int) ChecksumUtil.numBytes(
1220           onDiskBlockBytesWithHeader.size(),
1221           fileContext.getBytesPerChecksum());
1222       putHeader(uncompressedBlockBytesWithHeader, 0,
1223         onDiskBlockBytesWithHeader.size() + numBytes,
1224         uncompressedBlockBytesWithHeader.length, onDiskBlockBytesWithHeader.size());
1225       return ByteBuffer.wrap(uncompressedBlockBytesWithHeader);
1226     }
1227 
1228     /**
1229      * Clones the header followed by the on-disk (compressed/encoded/encrypted) data. This is
1230      * needed for storing packed blocks in the block cache. Expects calling semantics identical to
1231      * {@link #getUncompressedBufferWithHeader()}. Returns only the header and data,
1232      * Does not include checksum data.
1233      *
1234      * @return Returns a copy of block bytes for caching on write
1235      */
1236     private ByteBuffer cloneOnDiskBufferWithHeader() {
1237       expectState(State.BLOCK_READY);
1238       return ByteBuffer.wrap(onDiskBlockBytesWithHeader.toByteArray());
1239     }
1240 
1241     private void expectState(State expectedState) {
1242       if (state != expectedState) {
1243         throw new IllegalStateException("Expected state: " + expectedState +
1244             ", actual state: " + state);
1245       }
1246     }
1247 
1248     /**
1249      * Takes the given {@link BlockWritable} instance, creates a new block of
1250      * its appropriate type, writes the writable into this block, and flushes
1251      * the block into the output stream. The writer is instructed not to buffer
1252      * uncompressed bytes for cache-on-write.
1253      *
1254      * @param bw the block-writable object to write as a block
1255      * @param out the file system output stream
1256      * @throws IOException
1257      */
1258     void writeBlock(BlockWritable bw, FSDataOutputStream out)
1259         throws IOException {
1260       bw.writeToBlock(startWriting(bw.getBlockType()));
1261       writeHeaderAndData(out);
1262     }
1263 
1264     /**
1265      * Creates a new HFileBlock. Checksums have already been validated, so
1266      * the byte buffer passed into the constructor of this newly created
1267      * block does not have checksum data even though the header minor
1268      * version is MINOR_VERSION_WITH_CHECKSUM. This is indicated by setting a
1269      * 0 value in bytesPerChecksum. This method copies the on-disk or
1270      * uncompressed data to build the HFileBlock which is used only
1271      * while writing blocks and caching.
1272      *
1273      * <p>TODO: Should there be an option where a cache can ask that hbase preserve block
1274      * checksums for checking after a block comes out of the cache? Otehrwise, cache is responsible
1275      * for blocks being wholesome (ECC memory or if file-backed, it does checksumming).
1276      */
1277     HFileBlock getBlockForCaching(CacheConfig cacheConf) {
1278       HFileContext newContext = new HFileContextBuilder()
1279                                 .withBlockSize(fileContext.getBlocksize())
1280                                 .withBytesPerCheckSum(0)
1281                                 .withChecksumType(ChecksumType.NULL) // no checksums in cached data
1282                                 .withCompression(fileContext.getCompression())
1283                                 .withDataBlockEncoding(fileContext.getDataBlockEncoding())
1284                                 .withHBaseCheckSum(fileContext.isUseHBaseChecksum())
1285                                 .withCompressTags(fileContext.isCompressTags())
1286                                 .withIncludesMvcc(fileContext.isIncludesMvcc())
1287                                 .withIncludesTags(fileContext.isIncludesTags())
1288                                 .withColumnFamily(fileContext.getColumnFamily())
1289                                 .withTableName(fileContext.getTableName())
1290                                 .build();
1291        return new HFileBlock(blockType, getOnDiskSizeWithoutHeader(),
1292           getUncompressedSizeWithoutHeader(), prevOffset,
1293           cacheConf.shouldCacheCompressed(blockType.getCategory())?
1294             cloneOnDiskBufferWithHeader() :
1295             cloneUncompressedBufferWithHeader(),
1296           FILL_HEADER, startOffset, UNSET,
1297           onDiskBlockBytesWithHeader.size() + onDiskChecksum.length, newContext);
1298     }
1299   }
1300 
  /**
   * Something that can be written into a block. Implementations supply the block type to use
   * and serialize their content into the stream they are handed.
   */
  interface BlockWritable {

    /**
     * The type of block this data should use.
     *
     * @return the block type to start the block with
     */
    BlockType getBlockType();

    /**
     * Writes the block to the provided stream. Must not write any magic
     * records.
     *
     * @param out a stream to write uncompressed data into
     * @throws IOException if writing to the stream fails
     */
    void writeToBlock(DataOutput out) throws IOException;
  }
1315 
1316   // Block readers and writers
1317 
  /** An interface allowing to iterate {@link HFileBlock}s. */
  interface BlockIterator {

    /**
     * Get the next block, or null if there are no more blocks to iterate.
     *
     * @return the next block, or null when the iteration is exhausted
     * @throws IOException if reading the block fails
     */
    HFileBlock nextBlock() throws IOException;

    /**
     * Similar to {@link #nextBlock()} but checks block type, throws an
     * exception if incorrect, and returns the HFile block
     *
     * @param blockType the type the next block is expected to have
     * @return the next block
     * @throws IOException if reading the block fails or its type is not the expected one
     */
    HFileBlock nextBlockWithBlockType(BlockType blockType) throws IOException;
  }
1332 
  /** A full-fledged reader with iteration ability. */
  interface FSReader {

    /**
     * Reads the block at the given offset in the file with the given on-disk
     * size.
     *
     * @param offset the offset in the file at which the block starts
     * @param onDiskSize the on-disk size of the entire block, including all
     *          applicable headers, or -1 if unknown
     * @param pread NOTE(review): presumably selects positional read over a seek-and-read on
     *          the shared stream - confirm against the implementation
     * @param updateMetrics NOTE(review): presumably whether read metrics should be updated
     *          for this call - confirm against the implementation
     * @return the newly read block
     * @throws IOException if the block cannot be read
     */
    HFileBlock readBlockData(long offset, long onDiskSize, boolean pread, boolean updateMetrics)
        throws IOException;

    /**
     * Creates a block iterator over the given portion of the {@link HFile}.
     * The iterator returns blocks starting with offset such that offset &lt;=
     * startOffset &lt; endOffset. Returned blocks are always unpacked.
     *
     * @param startOffset the offset of the block to start iteration with
     * @param endOffset the offset to end iteration at (exclusive)
     * @return an iterator of blocks between the two given offsets
     */
    BlockIterator blockRange(long startOffset, long endOffset);

    /**
     * Closes the backing streams
     *
     * @throws IOException if closing a stream fails
     */
    void closeStreams() throws IOException;

    /** Get a decoder for {@link BlockType#ENCODED_DATA} blocks from this file. */
    HFileBlockDecodingContext getBlockDecodingContext();

    /** Get the default decoder for blocks from this file. */
    HFileBlockDecodingContext getDefaultBlockDecodingContext();

    /**
     * To close the stream's socket. Note: This can be concurrently called from multiple threads and
     * implementation should take care of thread safety.
     */
    void unbufferStream();
  }
1374 
1375   /**
1376    * A common implementation of some methods of {@link FSReader} and some
1377    * tools for implementing HFile format version-specific block readers.
1378    */
1379   private abstract static class AbstractFSReader implements FSReader {
1380     /** Compression algorithm used by the {@link HFile} */
1381 
1382     /** The size of the file we are reading from, or -1 if unknown. */
1383     protected long fileSize;
1384 
1385     /** The size of the header */
1386     protected final int hdrSize;
1387 
1388     /** The filesystem used to access data */
1389     protected HFileSystem hfs;
1390 
1391     protected final Lock streamLock = new ReentrantLock();
1392 
1393     /** The default buffer size for our buffered streams */
1394     public static final int DEFAULT_BUFFER_SIZE = 1 << 20;
1395 
1396     protected HFileContext fileContext;
1397     // Cache the fileName
1398     protected String pathName;
1399 
1400     public AbstractFSReader(long fileSize, HFileSystem hfs, Path path, HFileContext fileContext)
1401         throws IOException {
1402       this.fileSize = fileSize;
1403       this.hfs = hfs;
1404       if (path != null) {
1405         this.pathName = path.toString();
1406       }
1407       this.fileContext = fileContext;
1408       this.hdrSize = headerSize(fileContext.isUseHBaseChecksum());
1409     }
1410 
1411     @Override
1412     public BlockIterator blockRange(final long startOffset,
1413         final long endOffset) {
1414       final FSReader owner = this; // handle for inner class
1415       return new BlockIterator() {
1416         private long offset = startOffset;
1417         // Cache length of next block. Current block has the length of next block in it.
1418         private long length = -1;
1419 
1420         @Override
1421         public HFileBlock nextBlock() throws IOException {
1422           if (offset >= endOffset) {
1423             return null;
1424           }
1425           HFileBlock b = readBlockData(offset, length, false, false);
1426           offset += b.getOnDiskSizeWithHeader();
1427           length = b.getNextBlockOnDiskSize();
1428           return b.unpack(fileContext, owner);
1429         }
1430 
1431         @Override
1432         public HFileBlock nextBlockWithBlockType(BlockType blockType)
1433             throws IOException {
1434           HFileBlock blk = nextBlock();
1435           if (blk.getBlockType() != blockType) {
1436             throw new IOException("Expected block of type " + blockType
1437                 + " but found " + blk.getBlockType());
1438           }
1439           return blk;
1440         }
1441       };
1442     }
1443 
1444     /**
1445      * Does a positional read or a seek and read into the given buffer. Returns
1446      * the on-disk size of the next block, or -1 if it could not be read/determined; e.g. EOF.
1447      *
1448      * @param dest destination buffer
1449      * @param destOffset offset into the destination buffer at where to put the bytes we read
1450      * @param size size of read
1451      * @param peekIntoNextBlock whether to read the next block's on-disk size
1452      * @param fileOffset position in the stream to read at
1453      * @param pread whether we should do a positional read
1454      * @param istream The input source of data
1455      * @return the on-disk size of the next block with header size included, or
1456      *         -1 if it could not be determined; if not -1, the <code>dest</code> INCLUDES the
1457      *         next header
1458      * @throws IOException
1459      */
1460     protected int readAtOffset(FSDataInputStream istream, byte [] dest, int destOffset, int size,
1461         boolean peekIntoNextBlock, long fileOffset, boolean pread) throws IOException {
1462       if (peekIntoNextBlock && destOffset + size + hdrSize > dest.length) {
1463         // We are asked to read the next block's header as well, but there is
1464         // not enough room in the array.
1465         throw new IOException("Attempted to read " + size + " bytes and " +
1466             hdrSize + " bytes of next header into a " + dest.length +
1467             "-byte array at offset " + destOffset);
1468       }
1469 
1470       if (!pread && streamLock.tryLock()) {
1471         // Seek + read. Better for scanning.
1472         try {
1473           HFileUtil.seekOnMultipleSources(istream, fileOffset);
1474           // TODO: do we need seek time latencies?
1475           long realOffset = istream.getPos();
1476           if (realOffset != fileOffset) {
1477             throw new IOException("Tried to seek to " + fileOffset + " to "
1478                 + "read " + size + " bytes, but pos=" + realOffset
1479                 + " after seek");
1480           }
1481 
1482           if (!peekIntoNextBlock) {
1483             IOUtils.readFully(istream, dest, destOffset, size);
1484             return -1;
1485           }
1486 
1487           // Try to read the next block header.
1488           if (!readWithExtra(istream, dest, destOffset, size, hdrSize)) {
1489             return -1;
1490           }
1491         } finally {
1492           streamLock.unlock();
1493         }
1494       } else {
1495         // Positional read. Better for random reads; or when the streamLock is already locked.
1496         int extraSize = peekIntoNextBlock ? hdrSize : 0;
1497         if (!positionalReadWithExtra(istream, fileOffset, dest, destOffset, size, extraSize)) {
1498           return -1;
1499         }
1500       }
1501 
1502       assert peekIntoNextBlock;
1503       return Bytes.toInt(dest, destOffset + size + BlockType.MAGIC_LENGTH) + hdrSize;
1504     }
1505 
1506   }
1507 
1508   /**
1509    * Data-structure to use caching the header of the NEXT block. Only works if next read
1510    * that comes in here is next in sequence in this block.
1511    *
1512    * When we read, we read current block and the next blocks' header. We do this so we have
1513    * the length of the next block to read if the hfile index is not available (rare).
1514    * TODO: Review!! This trick of reading next blocks header is a pain, complicates our
1515    * read path and I don't think it needed given it rare we don't have the block index
1516    * (it is 'normally' present, gotten from the hfile index). FIX!!!
1517   */
1518   private static class PrefetchedHeader {
1519     long offset = -1;
1520     byte [] header = new byte[HConstants.HFILEBLOCK_HEADER_SIZE];
1521     final ByteBuffer buf = ByteBuffer.wrap(header, 0, HConstants.HFILEBLOCK_HEADER_SIZE);
1522 
1523     @Override
1524     public String toString() {
1525       return "offset=" + this.offset + ", header=" + Bytes.toStringBinary(header);
1526     }
1527   }
1528 
1529   /**
1530    * Reads version 2 blocks from the filesystem.
1531    */
1532   static class FSReaderImpl extends AbstractFSReader {
1533     /** The file system stream of the underlying {@link HFile} that
1534      * does or doesn't do checksum validations in the filesystem */
1535     protected FSDataInputStreamWrapper streamWrapper;
1536 
1537     private HFileBlockDecodingContext encodedBlockDecodingCtx;
1538 
1539     /** Default context used when BlockType != {@link BlockType#ENCODED_DATA}. */
1540     private final HFileBlockDefaultDecodingContext defaultDecodingCtx;
1541 
1542     /**
1543      * Cache of the NEXT header after this. Check it is indeed next blocks header
1544      * before using it. TODO: Review. This overread into next block to fetch
1545      * next blocks header seems unnecessary given we usually get the block size
1546      * from the hfile index. Review!
1547      */
1548     private AtomicReference<PrefetchedHeader> prefetchedHeader =
1549       new AtomicReference<PrefetchedHeader>(new PrefetchedHeader());
1550 
1551     public FSReaderImpl(FSDataInputStreamWrapper stream, long fileSize, HFileSystem hfs, Path path,
1552         HFileContext fileContext) throws IOException {
1553       super(fileSize, hfs, path, fileContext);
1554       this.streamWrapper = stream;
1555       // Older versions of HBase didn't support checksum.
1556       this.streamWrapper.prepareForBlockReader(!fileContext.isUseHBaseChecksum());
1557       defaultDecodingCtx = new HFileBlockDefaultDecodingContext(fileContext);
1558       encodedBlockDecodingCtx = defaultDecodingCtx;
1559     }
1560 
1561     /**
1562      * A constructor that reads files with the latest minor version.
1563      * This is used by unit tests only.
1564      */
1565     FSReaderImpl(FSDataInputStream istream, long fileSize, HFileContext fileContext)
1566     throws IOException {
1567       this(new FSDataInputStreamWrapper(istream), fileSize, null, null, fileContext);
1568     }
1569 
1570     /**
1571      * Reads a version 2 block (version 1 blocks not supported and not expected). Tries to do as
1572      * little memory allocation as possible, using the provided on-disk size.
1573      *
1574      * @param offset the offset in the stream to read at
1575      * @param onDiskSizeWithHeaderL the on-disk size of the block, including
1576      *          the header, or -1 if unknown; i.e. when iterating over blocks reading
1577      *          in the file metadata info.
1578      * @param pread whether to use a positional read
1579      */
1580     @Override
1581     public HFileBlock readBlockData(long offset, long onDiskSizeWithHeaderL, boolean pread,
1582                                     boolean updateMetrics) throws IOException {
1583       // Get a copy of the current state of whether to validate
1584       // hbase checksums or not for this read call. This is not
1585       // thread-safe but the one constaint is that if we decide
1586       // to skip hbase checksum verification then we are
1587       // guaranteed to use hdfs checksum verification.
1588       boolean doVerificationThruHBaseChecksum = streamWrapper.shouldUseHBaseChecksum();
1589       FSDataInputStream is = streamWrapper.getStream(doVerificationThruHBaseChecksum);
1590 
1591       HFileBlock blk = readBlockDataInternal(is, offset,
1592                          onDiskSizeWithHeaderL, pread,
1593                          doVerificationThruHBaseChecksum, updateMetrics);
1594       if (blk == null) {
1595         HFile.LOG.warn("HBase checksum verification failed for file " +
1596                        pathName + " at offset " +
1597                        offset + " filesize " + fileSize +
1598                        ". Retrying read with HDFS checksums turned on...");
1599 
1600         if (!doVerificationThruHBaseChecksum) {
1601           String msg = "HBase checksum verification failed for file " +
1602                        pathName + " at offset " +
1603                        offset + " filesize " + fileSize +
1604                        " but this cannot happen because doVerify is " +
1605                        doVerificationThruHBaseChecksum;
1606           HFile.LOG.warn(msg);
1607           throw new IOException(msg); // cannot happen case here
1608         }
1609         HFile.CHECKSUM_FAILURES.increment(); // update metrics
1610 
1611         // If we have a checksum failure, we fall back into a mode where
1612         // the next few reads use HDFS level checksums. We aim to make the
1613         // next CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD reads avoid
1614         // hbase checksum verification, but since this value is set without
1615         // holding any locks, it can so happen that we might actually do
1616         // a few more than precisely this number.
1617         is = this.streamWrapper.fallbackToFsChecksum(CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD);
1618         doVerificationThruHBaseChecksum = false;
1619         blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL, pread,
1620                                     doVerificationThruHBaseChecksum, updateMetrics);
1621         if (blk != null) {
1622           HFile.LOG.warn("HDFS checksum verification succeeded for file " +
1623                          pathName + " at offset " +
1624                          offset + " filesize " + fileSize);
1625         }
1626       }
1627       if (blk == null && !doVerificationThruHBaseChecksum) {
1628         String msg = "readBlockData failed, possibly due to " +
1629                      "checksum verification failed for file " + pathName +
1630                      " at offset " + offset + " filesize " + fileSize;
1631         HFile.LOG.warn(msg);
1632         throw new IOException(msg);
1633       }
1634 
1635       // If there is a checksum mismatch earlier, then retry with
1636       // HBase checksums switched off and use HDFS checksum verification.
1637       // This triggers HDFS to detect and fix corrupt replicas. The
1638       // next checksumOffCount read requests will use HDFS checksums.
1639       // The decrementing of this.checksumOffCount is not thread-safe,
1640       // but it is harmless because eventually checksumOffCount will be
1641       // a negative number.
1642       streamWrapper.checksumOk();
1643       return blk;
1644     }
1645 
1646      /**
1647      * @return Check <code>onDiskSizeWithHeaderL</code> size is healthy and then return it as an int
1648      * @throws IOException
1649      */
1650     private static int checkAndGetSizeAsInt(final long onDiskSizeWithHeaderL, final int hdrSize)
1651     throws IOException {
1652       if ((onDiskSizeWithHeaderL < hdrSize && onDiskSizeWithHeaderL != -1)
1653           || onDiskSizeWithHeaderL >= Integer.MAX_VALUE) {
1654         throw new IOException("Invalid onDisksize=" + onDiskSizeWithHeaderL
1655             + ": expected to be at least " + hdrSize
1656             + " and at most " + Integer.MAX_VALUE + ", or -1");
1657       }
1658       return (int)onDiskSizeWithHeaderL;
1659     }
1660 
1661     /**
1662      * Check atomic reference cache for this block's header. Cache only good if next
1663      * read coming through is next in sequence in the block. We read next block's
1664      * header on the tail of reading the previous block to save a seek. Otherwise,
1665      * we have to do a seek to read the header before we can pull in the block OR
1666      * we have to backup the stream because we over-read (the next block's header).
1667      * @see PrefetchedHeader
1668      * @return The cached block header or null if not found.
1669      * @see #cacheNextBlockHeader(long, byte[], int, int)
1670      */
1671     private ByteBuffer getCachedHeader(final long offset) {
1672       PrefetchedHeader ph = this.prefetchedHeader.get();
1673       return ph != null && ph.offset == offset? ph.buf: null;
1674     }
1675 
1676     /**
1677      * Save away the next blocks header in atomic reference.
1678      * @see #getCachedHeader(long)
1679      * @see PrefetchedHeader
1680      */
1681     private void cacheNextBlockHeader(final long offset,
1682         final byte [] header, final int headerOffset, final int headerLength) {
1683       PrefetchedHeader ph = new PrefetchedHeader();
1684       ph.offset = offset;
1685       System.arraycopy(header, headerOffset, ph.header, 0, headerLength);
1686       this.prefetchedHeader.set(ph);
1687     }
1688 
1689     /**
1690      * Verify the passed in onDiskSizeWithHeader aligns with what is in the header else something
1691      * is not right.
1692      * @throws IOException
1693      */
1694     private void verifyOnDiskSizeMatchesHeader(final int passedIn, final ByteBuffer headerBuf,
1695         final long offset, boolean verifyChecksum)
1696     throws IOException {
1697       // Assert size provided aligns with what is in the header
1698       int fromHeader = getOnDiskSizeWithHeader(headerBuf, verifyChecksum);
1699       if (passedIn != fromHeader) {
1700         throw new IOException("Passed in onDiskSizeWithHeader=" + passedIn + " != " + fromHeader +
1701             ", offset=" + offset + ", fileContext=" + this.fileContext);
1702       }
1703     }
1704 
1705     /**
1706      * Reads a version 2 block.
1707      *
1708      * @param offset the offset in the stream to read at. Usually the
1709      * @param onDiskSizeWithHeaderL the on-disk size of the block, including
1710      *          the header and checksums if present or -1 if unknown (as a long). Can be -1
1711      *          if we are doing raw iteration of blocks as when loading up file metadata; i.e.
1712      *          the first read of a new file (TODO: Fix! See HBASE-17072). Usually non-null gotten
1713      *          from the file index.
1714      * @param pread whether to use a positional read
1715      * @param verifyChecksum Whether to use HBase checksums.
1716      *        If HBase checksum is switched off, then use HDFS checksum.
1717      * @return the HFileBlock or null if there is a HBase checksum mismatch
1718      */
1719     protected HFileBlock readBlockDataInternal(FSDataInputStream is, long offset,
1720         long onDiskSizeWithHeaderL, boolean pread, boolean verifyChecksum, boolean updateMetrics)
1721     throws IOException {
1722       if (offset < 0) {
1723         throw new IOException("Invalid offset=" + offset + " trying to read "
1724             + "block (onDiskSize=" + onDiskSizeWithHeaderL + ")");
1725       }
1726       int onDiskSizeWithHeader = checkAndGetSizeAsInt(onDiskSizeWithHeaderL, hdrSize);
1727       // Try and get cached header. Will serve us in rare case where onDiskSizeWithHeaderL is -1
1728       // and will save us having to seek the stream backwards to reread the header we
1729       // read the last time through here.
1730       ByteBuffer headerBuf = getCachedHeader(offset);
1731       if (LOG.isTraceEnabled()) {
1732         LOG.trace("Reading " + this.fileContext.getHFileName() + " at offset=" + offset +
1733           ", pread=" + pread + ", verifyChecksum=" + verifyChecksum + ", cachedHeader=" +
1734           headerBuf + ", onDiskSizeWithHeader=" + onDiskSizeWithHeader);
1735       }
1736       long startTime = System.currentTimeMillis();
1737       if (onDiskSizeWithHeader <= 0) {
1738         // We were not passed the block size. Need to get it from the header. If header was not in
1739         // cache, need to seek to pull it in. This is costly and should happen very rarely.
1740         // Currently happens on open of a hfile reader where we read the trailer blocks for
1741         // indices. Otherwise, we are reading block sizes out of the hfile index. To check,
1742         // enable TRACE in this file and you'll get an exception in a LOG every time we seek.
1743         // See HBASE-17072 for more detail.
1744         if (headerBuf == null) {
1745           if (LOG.isTraceEnabled()) {
1746             LOG.trace("Extra see to get block size!", new RuntimeException());
1747           }
1748           headerBuf = ByteBuffer.allocate(hdrSize);
1749           readAtOffset(is, headerBuf.array(), headerBuf.arrayOffset(), hdrSize, false,
1750               offset, pread);
1751         }
1752         onDiskSizeWithHeader = getOnDiskSizeWithHeader(headerBuf,
1753           this.fileContext.isUseHBaseChecksum());
1754       }
1755 
1756       int preReadHeaderSize = headerBuf == null? 0 : hdrSize;
1757       // Allocate enough space to fit the next block's header too; saves a seek next time through.
1758       // onDiskBlock is whole block + header + checksums then extra hdrSize to read next header;
1759       // onDiskSizeWithHeader is header, body, and any checksums if present. preReadHeaderSize
1760       // says where to start reading. If we have the header cached, then we don't need to read
1761       // it again and we can likely read from last place we left off w/o need to backup and reread
1762       // the header we read last time through here. TODO: Review this overread of the header. Is it necessary
1763       // when we get the block size from the hfile index? See note on PrefetchedHeader class above.
1764       byte [] onDiskBlock = new byte[onDiskSizeWithHeader + hdrSize];
1765       int nextBlockOnDiskSize = readAtOffset(is, onDiskBlock, preReadHeaderSize,
1766           onDiskSizeWithHeader - preReadHeaderSize, true, offset + preReadHeaderSize, pread);
1767       if (headerBuf != null) {
1768         // The header has been read when reading the previous block OR in a distinct header-only
1769         // read. Copy to this block's header.
1770         System.arraycopy(headerBuf.array(), headerBuf.arrayOffset(), onDiskBlock, 0, hdrSize);
1771       } else {
1772         headerBuf = ByteBuffer.wrap(onDiskBlock, 0, hdrSize);
1773       }
1774 
1775       // Do a few checks before we go instantiate HFileBlock.
1776       assert onDiskSizeWithHeader > this.hdrSize;
1777       verifyOnDiskSizeMatchesHeader(onDiskSizeWithHeader, headerBuf, offset,
1778         this.fileContext.isUseHBaseChecksum());
1779       ByteBuffer onDiskBlockByteBuffer = ByteBuffer.wrap(onDiskBlock, 0, onDiskSizeWithHeader);
1780       // Verify checksum of the data before using it for building HFileBlock.
1781       if (verifyChecksum &&
1782           !validateChecksum(offset, onDiskBlockByteBuffer, hdrSize)) {
1783         return null;
1784       }
1785       long duration = System.currentTimeMillis() - startTime;
1786       if (updateMetrics) {
1787         HFile.updateReadLatency(duration, pread);
1788       }
1789       // The onDiskBlock will become the headerAndDataBuffer for this block.
1790       // If nextBlockOnDiskSizeWithHeader is not zero, the onDiskBlock already
1791       // contains the header of next block, so no need to set next block's header in it.
1792       HFileBlock hFileBlock =
1793           new HFileBlock(onDiskBlockByteBuffer, this.fileContext.isUseHBaseChecksum(), offset,
1794               nextBlockOnDiskSize, fileContext);
1795       // Run check on uncompressed sizings.
1796       if (!fileContext.isCompressedOrEncrypted()) {
1797         hFileBlock.sanityCheckUncompressed();
1798       }
1799       if (LOG.isTraceEnabled()) {
1800         LOG.trace("Read " + hFileBlock + " in " + duration + " ns");
1801       }
1802       // Cache next block header if we read it for the next time through here.
1803       if (nextBlockOnDiskSize != -1) {
1804         cacheNextBlockHeader(offset + hFileBlock.getOnDiskSizeWithHeader(),
1805             onDiskBlock, onDiskSizeWithHeader, hdrSize);
1806       }
1807       return hFileBlock;
1808     }
1809 
1810     void setIncludesMemstoreTS(boolean includesMemstoreTS) {
1811       this.fileContext.setIncludesMvcc(includesMemstoreTS);
1812     }
1813 
1814     void setDataBlockEncoder(HFileDataBlockEncoder encoder) {
1815       encodedBlockDecodingCtx = encoder.newDataBlockDecodingContext(this.fileContext);
1816     }
1817 
1818     @Override
1819     public HFileBlockDecodingContext getBlockDecodingContext() {
1820       return this.encodedBlockDecodingCtx;
1821     }
1822 
1823     @Override
1824     public HFileBlockDecodingContext getDefaultBlockDecodingContext() {
1825       return this.defaultDecodingCtx;
1826     }
1827 
1828     /**
1829      * Generates the checksum for the header as well as the data and then validates it.
1830      * If the block doesn't uses checksum, returns false.
1831      * @return True if checksum matches, else false.
1832      */
1833     protected boolean validateChecksum(long offset, ByteBuffer data, int hdrSize)
1834         throws IOException {
1835       // If this is an older version of the block that does not have checksums, then return false
1836       // indicating that checksum verification did not succeed. Actually, this method should never
1837       // be called when the minorVersion is 0, thus this is a defensive check for a cannot-happen
1838       // case. Since this is a cannot-happen case, it is better to return false to indicate a
1839       // checksum validation failure.
1840       if (!fileContext.isUseHBaseChecksum()) {
1841         return false;
1842       }
1843       return ChecksumUtil.validateChecksum(data, pathName, offset, hdrSize);
1844     }
1845 
1846     @Override
1847     public void closeStreams() throws IOException {
1848       streamWrapper.close();
1849     }
1850 
1851     @Override
1852     public void unbufferStream() {
1853       // To handle concurrent reads, ensure that no other client is accessing the streams while we
1854       // unbuffer it.
1855       if (streamLock.tryLock()) {
1856         try {
1857           this.streamWrapper.unbuffer();
1858         } finally {
1859           streamLock.unlock();
1860         }
1861       }
1862     }
1863 
1864     @Override
1865     public String toString() {
1866       return "hfs=" + hfs + ", path=" + pathName + ", fileContext=" + fileContext;
1867     }
1868   }
1869 
1870   /** An additional sanity-check in case no compression or encryption is being used. */
1871   void sanityCheckUncompressed() throws IOException {
1872     if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader +
1873         totalChecksumBytes()) {
1874       throw new IOException("Using no compression but "
1875           + "onDiskSizeWithoutHeader=" + onDiskSizeWithoutHeader + ", "
1876           + "uncompressedSizeWithoutHeader=" + uncompressedSizeWithoutHeader
1877           + ", numChecksumbytes=" + totalChecksumBytes());
1878     }
1879   }
1880 
1881   // Cacheable implementation
1882   @Override
1883   public int getSerializedLength() {
1884     if (buf != null) {
1885       // Include extra bytes for block metadata.
1886       return this.buf.limit() + BLOCK_METADATA_SPACE;
1887     }
1888     return 0;
1889   }
1890 
1891   // Cacheable implementation
1892   @Override
1893   public void serialize(ByteBuffer destination, boolean includeNextBlockOnDiskSize) {
1894     ByteBufferUtils.copyFromBufferToBuffer(destination, this.buf, 0,
1895         getSerializedLength() - BLOCK_METADATA_SPACE);
1896     destination = addMetaData(destination, includeNextBlockOnDiskSize);
1897 
1898     // Make it ready for reading. flip sets position to zero and limit to current position which
1899     // is what we want if we do not want to serialize the block plus checksums if present plus
1900     // metadata.
1901     destination.flip();
1902   }
1903 
1904   /**
1905    * For use by bucketcache. This exposes internals.
1906    */
1907   public ByteBuffer getMetaData() {
1908     ByteBuffer bb = ByteBuffer.allocate(BLOCK_METADATA_SPACE);
1909     bb = addMetaData(bb, true);
1910     bb.flip();
1911     return bb;
1912   }
1913 
1914   /**
1915    * Adds metadata at current position (position is moved forward). Does not flip or reset.
1916    * @return The passed <code>destination</code> with metadata added.
1917    */
1918   private ByteBuffer addMetaData(final ByteBuffer destination, boolean includeNextBlockMetadata) {
1919     destination.put(this.fileContext.isUseHBaseChecksum() ? (byte) 1 : (byte) 0);
1920     destination.putLong(this.offset);
1921     if (includeNextBlockMetadata) {
1922       destination.putInt(this.nextBlockOnDiskSize);
1923     }
1924     return destination;
1925   }
1926 
1927   // Cacheable implementation
1928   @Override
1929   public CacheableDeserializer<Cacheable> getDeserializer() {
1930     return HFileBlock.BLOCK_DESERIALIZER;
1931   }
1932 
1933   @Override
1934   public int hashCode() {
1935     final int prime = 31;
1936     int result = 1;
1937     result = prime * result + ((blockType == null) ? 0 : blockType.hashCode());
1938     result = prime * result + ((buf == null) ? 0 : buf.hashCode());
1939     result = prime * result + ((fileContext == null) ? 0 : fileContext.hashCode());
1940     result = prime * result + nextBlockOnDiskSize;
1941     result = prime * result + (int) (offset ^ (offset >>> 32));
1942     result = prime * result + onDiskDataSizeWithHeader;
1943     result = prime * result + onDiskSizeWithoutHeader;
1944     result = prime * result + (int) (prevBlockOffset ^ (prevBlockOffset >>> 32));
1945     result = prime * result + uncompressedSizeWithoutHeader;
1946     return result;
1947   }
1948 
1949   @Override
1950   public boolean equals(Object comparison) {
1951     if (this == comparison) {
1952       return true;
1953     }
1954     if (comparison == null) {
1955       return false;
1956     }
1957     if (comparison.getClass() != this.getClass()) {
1958       return false;
1959     }
1960 
1961     HFileBlock castedComparison = (HFileBlock) comparison;
1962 
1963     if (castedComparison.blockType != this.blockType) {
1964       return false;
1965     }
1966     if (castedComparison.nextBlockOnDiskSize != this.nextBlockOnDiskSize) {
1967       return false;
1968     }
1969     // Offset is important. Needed when we have to remake cachekey when block is returned to cache.
1970     if (castedComparison.offset != this.offset) {
1971       return false;
1972     }
1973     if (castedComparison.onDiskSizeWithoutHeader != this.onDiskSizeWithoutHeader) {
1974       return false;
1975     }
1976     if (castedComparison.prevBlockOffset != this.prevBlockOffset) {
1977       return false;
1978     }
1979     if (castedComparison.uncompressedSizeWithoutHeader != this.uncompressedSizeWithoutHeader) {
1980       return false;
1981     }
1982     if (ByteBufferUtils.compareTo(this.buf, 0, this.buf.limit(), castedComparison.buf, 0,
1983         castedComparison.buf.limit()) != 0) {
1984       return false;
1985     }
1986     return true;
1987   }
1988 
1989   public DataBlockEncoding getDataBlockEncoding() {
1990     if (blockType == BlockType.ENCODED_DATA) {
1991       return DataBlockEncoding.getEncodingById(getDataBlockEncodingId());
1992     }
1993     return DataBlockEncoding.NONE;
1994   }
1995 
1996   byte getChecksumType() {
1997     return this.fileContext.getChecksumType().getCode();
1998   }
1999 
2000   int getBytesPerChecksum() {
2001     return this.fileContext.getBytesPerChecksum();
2002   }
2003 
2004   /** @return the size of data on disk + header. Excludes checksum. */
2005   int getOnDiskDataSizeWithHeader() {
2006     return this.onDiskDataSizeWithHeader;
2007   }
2008 
2009   /**
2010    * Calculate the number of bytes required to store all the checksums
2011    * for this block. Each checksum value is a 4 byte integer.
2012    */
2013   int totalChecksumBytes() {
2014     // If the hfile block has minorVersion 0, then there are no checksum
2015     // data to validate. Similarly, a zero value in this.bytesPerChecksum
2016     // indicates that cached blocks do not have checksum data because
2017     // checksums were already validated when the block was read from disk.
2018     if (!fileContext.isUseHBaseChecksum() || this.fileContext.getBytesPerChecksum() == 0) {
2019       return 0;
2020     }
2021     return (int) ChecksumUtil.numBytes(onDiskDataSizeWithHeader,
2022         this.fileContext.getBytesPerChecksum());
2023   }
2024 
2025   /**
2026    * Returns the size of this block header.
2027    */
2028   public int headerSize() {
2029     return headerSize(this.fileContext.isUseHBaseChecksum());
2030   }
2031 
2032   /**
2033    * Maps a minor version to the size of the header.
2034    */
2035   public static int headerSize(boolean usesHBaseChecksum) {
2036     return usesHBaseChecksum?
2037         HConstants.HFILEBLOCK_HEADER_SIZE: HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM;
2038   }
2039 
2040   /**
2041    * Return the appropriate DUMMY_HEADER for the minor version
2042    */
2043   byte[] getDummyHeaderForVersion() {
2044     return getDummyHeaderForVersion(this.fileContext.isUseHBaseChecksum());
2045   }
2046 
2047   /**
2048    * Return the appropriate DUMMY_HEADER for the minor version
2049    */
2050   static private byte[] getDummyHeaderForVersion(boolean usesHBaseChecksum) {
2051     return usesHBaseChecksum? HConstants.HFILEBLOCK_DUMMY_HEADER: DUMMY_HEADER_NO_CHECKSUM;
2052   }
2053 
2054   /**
2055    * @return This HFileBlocks fileContext which will a derivative of the
2056    * fileContext for the file from which this block's data was originally read.
2057    */
2058   HFileContext getHFileContext() {
2059     return this.fileContext;
2060   }
2061 
2062   /**
2063    * Convert the contents of the block header into a human readable string.
2064    * This is mostly helpful for debugging. This assumes that the block
2065    * has minor version > 0.
2066    */
2067   static String toStringHeader(ByteBuffer buf) throws IOException {
2068     byte[] magicBuf = new byte[Math.min(buf.limit() - buf.position(), BlockType.MAGIC_LENGTH)];
2069     buf.get(magicBuf);
2070     BlockType bt = BlockType.parse(magicBuf, 0, BlockType.MAGIC_LENGTH);
2071     int compressedBlockSizeNoHeader = buf.getInt();
2072     int uncompressedBlockSizeNoHeader = buf.getInt();
2073     long prevBlockOffset = buf.getLong();
2074     byte cksumtype = buf.get();
2075     long bytesPerChecksum = buf.getInt();
2076     long onDiskDataSizeWithHeader = buf.getInt();
2077     return " Header dump: magic: " + Bytes.toString(magicBuf) +
2078                    " blockType " + bt +
2079                    " compressedBlockSizeNoHeader " +
2080                    compressedBlockSizeNoHeader +
2081                    " uncompressedBlockSizeNoHeader " +
2082                    uncompressedBlockSizeNoHeader +
2083                    " prevBlockOffset " + prevBlockOffset +
2084                    " checksumType " + ChecksumType.codeToType(cksumtype) +
2085                    " bytesPerChecksum " + bytesPerChecksum +
2086                    " onDiskDataSizeWithHeader " + onDiskDataSizeWithHeader;
2087   }
2088 }