View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import static org.junit.Assert.assertEquals;
23  import static org.junit.Assert.assertNotNull;
24  import static org.junit.Assert.assertTrue;
25  import static org.junit.Assert.fail;
26  import static org.mockito.Mockito.mock;
27  import static org.mockito.Mockito.when;
28  
29  import java.io.IOException;
30  import java.util.ArrayList;
31  import java.util.Collections;
32  import java.util.List;
33  import java.util.Random;
34  
35  import org.apache.commons.logging.Log;
36  import org.apache.commons.logging.LogFactory;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.fs.FileSystem;
39  import org.apache.hadoop.fs.Path;
40  import org.apache.hadoop.hbase.HBaseTestingUtility;
41  import org.apache.hadoop.hbase.HColumnDescriptor;
42  import org.apache.hadoop.hbase.KeyValue;
43  import org.apache.hadoop.hbase.testclassification.MediumTests;
44  import org.apache.hadoop.hbase.client.Scan;
45  import org.apache.hadoop.hbase.io.hfile.BlockCache;
46  import org.apache.hadoop.hbase.io.hfile.CacheConfig;
47  import org.apache.hadoop.hbase.io.hfile.HFile;
48  import org.apache.hadoop.hbase.io.hfile.HFileContext;
49  import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
50  import org.apache.hadoop.hbase.io.hfile.TestHFileWriterV2;
51  import org.apache.hadoop.hbase.util.BloomFilterFactory;
52  import org.apache.hadoop.hbase.util.ByteBloomFilter;
53  import org.apache.hadoop.hbase.util.Bytes;
54  import org.apache.hadoop.hbase.util.CompoundBloomFilter;
55  import org.apache.hadoop.hbase.util.CompoundBloomFilterBase;
56  import org.apache.hadoop.hbase.util.CompoundBloomFilterWriter;
57  import org.junit.Before;
58  import org.junit.Test;
59  import org.junit.experimental.categories.Category;
60  
/**
 * Tests writing Bloom filter blocks in the same part of the file as data
 * blocks, i.e. the "compound" (chunked) Bloom filter layout of HFile v2.
 * Runs {@link #NUM_TESTS} parameterized scenarios varying Bloom type,
 * key count, data block size, Bloom block size, and target error rate,
 * then statistically validates the observed false positive rate.
 */
@Category(MediumTests.class)
public class TestCompoundBloomFilter {

  private static final HBaseTestingUtility TEST_UTIL =
      new HBaseTestingUtility();

  private static final Log LOG = LogFactory.getLog(
      TestCompoundBloomFilter.class);

  /** Number of parameterized scenarios; every per-test array below must have this length. */
  private static final int NUM_TESTS = 9;

  /** Bloom filter type (ROW vs. ROWCOL) used by each scenario. */
  private static final BloomType BLOOM_TYPES[] = { BloomType.ROW,
      BloomType.ROW, BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROW,
      BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROW };

  /** Number of KeyValues written to the store file in each scenario. */
  private static final int NUM_KV[];
  static {
    final int N = 10000; // Only used in initialization.
    NUM_KV = new int[] { 21870, N, N, N, N, 1000, N, 7500, 7500};
    assert NUM_KV.length == NUM_TESTS;
  }

  /** HFile data block size for each scenario. */
  private static final int BLOCK_SIZES[];
  static {
    final int blkSize = 65536;
    BLOCK_SIZES = new int[] { 512, 1000, blkSize, blkSize, blkSize, 128, 300,
        blkSize, blkSize };
    assert BLOCK_SIZES.length == NUM_TESTS;
  }

  /**
   * Be careful not to specify too high a Bloom filter block size, otherwise
   * there will only be one oversized chunk and the observed false positive
   * rate will be too low.
   */
  private static final int BLOOM_BLOCK_SIZES[] = { 1000, 4096, 4096, 4096,
      8192, 128, 1024, 600, 600 };
  static { assert BLOOM_BLOCK_SIZES.length == NUM_TESTS; }

  /** Target (configured) Bloom filter false positive rate for each scenario. */
  private static final double TARGET_ERROR_RATES[] = { 0.025, 0.01, 0.015,
      0.01, 0.03, 0.01, 0.01, 0.07, 0.07 };
  static { assert TARGET_ERROR_RATES.length == NUM_TESTS; }

  /** A false positive rate that is obviously too high. */
  private static final double TOO_HIGH_ERROR_RATE;
  static {
    // 0.03 above the largest configured target rate across all scenarios.
    double m = 0;
    for (double errorRate : TARGET_ERROR_RATES)
      m = Math.max(m, errorRate);
    TOO_HIGH_ERROR_RATE = m + 0.03;
  }

  // NOTE(review): conf and cacheConf are static but (re)assigned from the
  // per-instance setUp()/writeStoreFile(); relies on JUnit's default
  // single-threaded execution — confirm before parallelizing.
  private static Configuration conf;
  private static CacheConfig cacheConf;
  private FileSystem fs;
  private BlockCache blockCache;

  /** A message of the form "in test#&lt;number&gt;:" to include in logging. */
  private String testIdMsg;

  // Fixed seeds so key generation and false-positive evaluation are
  // reproducible across runs.
  private static final int GENERATION_SEED = 2319;
  private static final int EVALUATION_SEED = 135;

  @Before
  public void setUp() throws IOException {
    conf = TEST_UTIL.getConfiguration();

    // This test requires the most recent HFile format (i.e. v2).
    conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MAX_FORMAT_VERSION);

    fs = FileSystem.get(conf);

    cacheConf = new CacheConfig(conf);
    blockCache = cacheConf.getBlockCache();
    assertNotNull(blockCache);
  }

  /**
   * Generates {@code n} random KeyValues (via TestHFileWriterV2's generator)
   * and returns them sorted by {@link KeyValue#COMPARATOR}, as required for
   * appending to a store file writer.
   */
  private List<KeyValue> createSortedKeyValues(Random rand, int n) {
    List<KeyValue> kvList = new ArrayList<KeyValue>(n);
    for (int i = 0; i < n; ++i)
      kvList.add(TestHFileWriterV2.randomKeyValue(rand));
    Collections.sort(kvList, KeyValue.COMPARATOR);
    return kvList;
  }

  /**
   * Main test: for each scenario, writes a store file with the configured
   * Bloom parameters and then reads it back, checking for false negatives
   * (forbidden) and an acceptable false positive rate.
   */
  @Test
  public void testCompoundBloomFilter() throws IOException {
    conf.setBoolean(BloomFilterFactory.IO_STOREFILE_BLOOM_ENABLED, true);
    for (int t = 0; t < NUM_TESTS; ++t) {
      conf.setFloat(BloomFilterFactory.IO_STOREFILE_BLOOM_ERROR_RATE,
          (float) TARGET_ERROR_RATES[t]);

      testIdMsg = "in test #" + t + ":";
      Random generationRand = new Random(GENERATION_SEED);
      List<KeyValue> kvs = createSortedKeyValues(generationRand, NUM_KV[t]);
      BloomType bt = BLOOM_TYPES[t];
      Path sfPath = writeStoreFile(t, bt, kvs);
      readStoreFile(t, bt, kvs, sfPath);
    }
  }

  /**
   * Validates the false positive ratio by computing its z-value and comparing
   * it to the provided threshold.
   *
   * <p>The z-value is the standard binomial-proportion test statistic:
   * (observed - expected) / sqrt(p * (1 - p) / nTrials).
   *
   * @param falsePosRate experimental positive rate
   * @param nTrials the number of Bloom filter checks
   * @param zValueBoundary z-value boundary, positive for an upper bound and
   *          negative for a lower bound
   * @param cbf the compound Bloom filter we are using
   * @param additionalMsg additional message to include in log output and
   *          assertion failures
   */
  private void validateFalsePosRate(double falsePosRate, int nTrials,
      double zValueBoundary, CompoundBloomFilter cbf, String additionalMsg) {
    double p = BloomFilterFactory.getErrorRate(conf);
    double zValue = (falsePosRate - p) / Math.sqrt(p * (1 - p) / nTrials);

    String assortedStatsStr = " (targetErrorRate=" + p + ", falsePosRate="
        + falsePosRate + ", nTrials=" + nTrials + ")";
    LOG.info("z-value is " + zValue + assortedStatsStr);

    // The sign of the boundary determines which tail we are testing.
    boolean isUpperBound = zValueBoundary > 0;

    if (isUpperBound && zValue > zValueBoundary ||
        !isUpperBound && zValue < zValueBoundary) {
      String errorMsg = "False positive rate z-value " + zValue + " is "
          + (isUpperBound ? "higher" : "lower") + " than " + zValueBoundary
          + assortedStatsStr + ". Per-chunk stats:\n"
          + cbf.formatTestingStats();
      fail(errorMsg + additionalMsg);
    }
  }

  /**
   * Opens the store file written by {@link #writeStoreFile} and verifies:
   * (1) no false negatives for any of the inserted keys, and (2) the false
   * positive rate is within statistically plausible bounds, both in "fake
   * lookup" mode (random generator injected into ByteBloomFilter) and in
   * production lookup mode.
   */
  private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs,
      Path sfPath) throws IOException {
    StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt);
    StoreFile.Reader r = sf.createReader();
    final boolean pread = true; // does not really matter
    StoreFileScanner scanner = r.getStoreFileScanner(true, pread, false, 0, 0, false);

    {
      // Test for false negatives (not allowed).
      int numChecked = 0;
      for (KeyValue kv : kvs) {
        byte[] row = kv.getRow();
        boolean present = isInBloom(scanner, row, kv.getQualifier());
        assertTrue(testIdMsg + " Bloom filter false negative on row "
            + Bytes.toStringBinary(row) + " after " + numChecked
            + " successful checks", present);
        ++numChecked;
      }
    }

    // Test for false positives (some percentage allowed). We test in two modes:
    // "fake lookup" which ignores the key distribution, and production mode.
    for (boolean fakeLookupEnabled : new boolean[] { true, false }) {
      if (fakeLookupEnabled) {
        ByteBloomFilter.setRandomGeneratorForTest(new Random(283742987L));
      }
      try {
        String fakeLookupModeStr = ", fake lookup is " + (fakeLookupEnabled ?
            "enabled" : "disabled");
        CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter();
        cbf.enableTestingStats();
        int numFalsePos = 0;
        Random rand = new Random(EVALUATION_SEED);
        // 10x as many probes as keys inserted, so the z-test has power.
        int nTrials = NUM_KV[t] * 10;
        for (int i = 0; i < nTrials; ++i) {
          byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand);
          if (isInBloom(scanner, query, bt, rand)) {
            numFalsePos += 1;
          }
        }
        double falsePosRate = numFalsePos * 1.0 / nTrials;
        LOG.debug(String.format(testIdMsg
            + " False positives: %d out of %d (%f)",
            numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr);

        // Check for obvious Bloom filter crashes.
        assertTrue("False positive is too high: " + falsePosRate + " (greater "
            + "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr,
            falsePosRate < TOO_HIGH_ERROR_RATE);

        // Now a more precise check to see if the false positive rate is not
        // too high. The reason we use a relaxed restriction for the real-world
        // case as opposed to the "fake lookup" case is that our hash functions
        // are not completely independent.

        double maxZValue = fakeLookupEnabled ? 1.96 : 2.5;
        validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf,
            fakeLookupModeStr);

        // For checking the lower bound we need to eliminate the last chunk,
        // because it is frequently smaller and the false positive rate in it
        // is too low. This does not help if there is only one under-sized
        // chunk, though.
        int nChunks = cbf.getNumChunks();
        if (nChunks > 1) {
          numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1);
          nTrials -= cbf.getNumQueriesForTesting(nChunks - 1);
          falsePosRate = numFalsePos * 1.0 / nTrials;
          LOG.info(testIdMsg + " False positive rate without last chunk is " +
              falsePosRate + fakeLookupModeStr);
        }

        // Lower-bound check: the rate should not be implausibly LOW either,
        // which would suggest the filter is over-sized or queries are biased.
        validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf,
            fakeLookupModeStr);
      } finally {
        // Always restore production behavior for subsequent iterations/tests.
        ByteBloomFilter.setRandomGeneratorForTest(null);
      }
    }

    r.close(true); // end of test so evictOnClose
  }

  /**
   * Bloom membership probe used for the false-positive trials.
   *
   * <p>NOTE(review): the {@code bt} parameter is unused, and a fresh random
   * qualifier is generated for every probe regardless of Bloom type —
   * presumably intentional so ROWCOL filters are queried with random
   * row/qualifier pairs; confirm against the writer-side key generation.
   */
  private boolean isInBloom(StoreFileScanner scanner, byte[] row, BloomType bt,
      Random rand) {
    return isInBloom(scanner, row, TestHFileWriterV2.randomRowOrQualifier(rand));
  }

  /**
   * Asks the scanner whether it should be used for a single-row scan of
   * {@code row}/{@code qualifier}; with Bloom filters enabled this is the
   * Bloom filter membership check. Store/HColumnDescriptor are mocked just
   * to supply the column family name.
   */
  private boolean isInBloom(StoreFileScanner scanner, byte[] row,
      byte[] qualifier) {
    Scan scan = new Scan(row, row);
    scan.addColumn(Bytes.toBytes(TestHFileWriterV2.COLUMN_FAMILY_NAME), qualifier);
    Store store = mock(Store.class);
    HColumnDescriptor hcd = mock(HColumnDescriptor.class);
    when(hcd.getName()).thenReturn(Bytes.toBytes(TestHFileWriterV2.COLUMN_FAMILY_NAME));
    when(store.getFamily()).thenReturn(hcd);
    return scanner.shouldUseScanner(scan, store, Long.MIN_VALUE);
  }

  /**
   * Writes the given sorted KeyValues to a new store file using scenario
   * {@code t}'s Bloom block size and data block size, verifying after every
   * append that the compound Bloom filter writer's key count matches the
   * number of distinct keys (rows for ROW, row+column for ROWCOL).
   *
   * @return the path of the written store file
   */
  private Path writeStoreFile(int t, BloomType bt, List<KeyValue> kvs)
      throws IOException {
    conf.setInt(BloomFilterFactory.IO_STOREFILE_BLOOM_BLOCK_SIZE,
        BLOOM_BLOCK_SIZES[t]);
    conf.setBoolean(CacheConfig.CACHE_BLOCKS_ON_WRITE_KEY, true);
    // Rebuild the cache config so the cache-on-write setting takes effect.
    cacheConf = new CacheConfig(conf);
    HFileContext meta = new HFileContextBuilder().withBlockSize(BLOCK_SIZES[t]).build();
    StoreFile.Writer w = new StoreFile.WriterBuilder(conf, cacheConf, fs)
            .withOutputDir(TEST_UTIL.getDataTestDir())
            .withBloomType(bt)
            .withFileContext(meta)
            .build();

    assertTrue(w.hasGeneralBloom());
    assertTrue(w.getGeneralBloomWriter() instanceof CompoundBloomFilterWriter);
    CompoundBloomFilterWriter cbbf =
        (CompoundBloomFilterWriter) w.getGeneralBloomWriter();

    int keyCount = 0;
    KeyValue prev = null;
    LOG.debug("Total keys/values to insert: " + kvs.size());
    for (KeyValue kv : kvs) {
      w.append(kv);

      // Validate the key count in the Bloom filter.
      boolean newKey = true;
      if (prev != null) {
        newKey = !(bt == BloomType.ROW ? KeyValue.COMPARATOR.matchingRows(kv,
            prev) : KeyValue.COMPARATOR.matchingRowColumn(kv, prev));
      }
      if (newKey)
        ++keyCount;
      assertEquals(keyCount, cbbf.getKeyCount());

      prev = kv;
    }
    w.close();

    return w.getPath();
  }

  /**
   * Sanity-checks Bloom sizing math: the bit size recomputed from
   * idealMaxKeys(bits, rate) should round-trip to just under the original
   * bit size (ratio within 0.0001 of 0.9999).
   */
  @Test
  public void testCompoundBloomSizing() {
    int bloomBlockByteSize = 4096;
    int bloomBlockBitSize = bloomBlockByteSize * 8;
    double targetErrorRate = 0.01;
    long maxKeysPerChunk = ByteBloomFilter.idealMaxKeys(bloomBlockBitSize,
        targetErrorRate);

    long bloomSize1 = bloomBlockByteSize * 8;
    long bloomSize2 = ByteBloomFilter.computeBitSize(maxKeysPerChunk,
        targetErrorRate);

    double bloomSizeRatio = (bloomSize2 * 1.0 / bloomSize1);
    assertTrue(Math.abs(bloomSizeRatio - 0.9999) < 0.0001);
  }

  /**
   * Verifies createBloomKey: a row-only key (zero-length qualifier) and a
   * row+column key for the same row must decode to KeyValues with equal
   * timestamps and equal rows, and the row-only key's qualifier is empty.
   */
  @Test
  public void testCreateKey() {
    CompoundBloomFilterBase cbfb = new CompoundBloomFilterBase();
    byte[] row = "myRow".getBytes();
    byte[] qualifier = "myQualifier".getBytes();
    byte[] rowKey = cbfb.createBloomKey(row, 0, row.length,
        row, 0, 0);
    byte[] rowColKey = cbfb.createBloomKey(row, 0, row.length,
        qualifier, 0, qualifier.length);
    KeyValue rowKV = KeyValue.createKeyValueFromKey(rowKey);
    KeyValue rowColKV = KeyValue.createKeyValueFromKey(rowColKey);
    assertEquals(rowKV.getTimestamp(), rowColKV.getTimestamp());
    assertEquals(Bytes.toStringBinary(rowKV.getRow()),
        Bytes.toStringBinary(rowColKV.getRow()));
    assertEquals(0, rowKV.getQualifier().length);
  }


}
372