View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.mapreduce;
20  
21  import static org.junit.Assert.assertEquals;
22  import static org.junit.Assert.assertFalse;
23  import static org.junit.Assert.assertNotNull;
24  import static org.junit.Assert.assertNotSame;
25  import static org.junit.Assert.assertTrue;
26  import static org.junit.Assert.fail;
27  
28  import java.io.IOException;
29  import java.util.Arrays;
30  import java.util.HashMap;
31  import java.util.List;
32  import java.util.Map;
33  import java.util.Map.Entry;
34  import java.util.Random;
35  import java.util.Set;
36  import java.util.concurrent.Callable;
37  
38  import junit.framework.Assert;
39  
40  import org.apache.commons.logging.Log;
41  import org.apache.commons.logging.LogFactory;
42  import org.apache.hadoop.conf.Configuration;
43  import org.apache.hadoop.fs.FileStatus;
44  import org.apache.hadoop.fs.FileSystem;
45  import org.apache.hadoop.fs.Path;
46  import org.apache.hadoop.hbase.CategoryBasedTimeout;
47  import org.apache.hadoop.hbase.Cell;
48  import org.apache.hadoop.hbase.CellUtil;
49  import org.apache.hadoop.hbase.CompatibilitySingletonFactory;
50  import org.apache.hadoop.hbase.HBaseConfiguration;
51  import org.apache.hadoop.hbase.HBaseTestingUtility;
52  import org.apache.hadoop.hbase.HColumnDescriptor;
53  import org.apache.hadoop.hbase.HConstants;
54  import org.apache.hadoop.hbase.HTableDescriptor;
55  import org.apache.hadoop.hbase.HadoopShims;
56  import org.apache.hadoop.hbase.KeyValue;
57  import org.apache.hadoop.hbase.PerformanceEvaluation;
58  import org.apache.hadoop.hbase.TableName;
59  import org.apache.hadoop.hbase.client.HBaseAdmin;
60  import org.apache.hadoop.hbase.client.HTable;
61  import org.apache.hadoop.hbase.client.Put;
62  import org.apache.hadoop.hbase.client.RegionLocator;
63  import org.apache.hadoop.hbase.client.Result;
64  import org.apache.hadoop.hbase.client.ResultScanner;
65  import org.apache.hadoop.hbase.client.Scan;
66  import org.apache.hadoop.hbase.client.Table;
67  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
68  import org.apache.hadoop.hbase.io.compress.Compression;
69  import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
70  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
71  import org.apache.hadoop.hbase.io.hfile.CacheConfig;
72  import org.apache.hadoop.hbase.io.hfile.HFile;
73  import org.apache.hadoop.hbase.io.hfile.HFile.Reader;
74  import org.apache.hadoop.hbase.regionserver.BloomType;
75  import org.apache.hadoop.hbase.regionserver.HRegion;
76  import org.apache.hadoop.hbase.regionserver.HStore;
77  import org.apache.hadoop.hbase.regionserver.Store;
78  import org.apache.hadoop.hbase.regionserver.StoreFile;
79  import org.apache.hadoop.hbase.regionserver.TimeRangeTracker;
80  import org.apache.hadoop.hbase.testclassification.LargeTests;
81  import org.apache.hadoop.hbase.util.Bytes;
82  import org.apache.hadoop.hbase.util.FSUtils;
83  import org.apache.hadoop.hbase.util.Threads;
84  import org.apache.hadoop.hbase.util.Writables;
85  import org.apache.hadoop.io.NullWritable;
86  import org.apache.hadoop.mapreduce.Job;
87  import org.apache.hadoop.mapreduce.Mapper;
88  import org.apache.hadoop.mapreduce.RecordWriter;
89  import org.apache.hadoop.mapreduce.TaskAttemptContext;
90  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
91  import org.junit.Ignore;
92  import org.junit.Rule;
93  import org.junit.Test;
94  import org.junit.experimental.categories.Category;
95  import org.junit.rules.TestRule;
96  import org.mockito.Mockito;
97  
98  /**
99   * Simple test for {@link KeyValueSortReducer} and {@link HFileOutputFormat}.
100  * Sets up and runs a mapreduce job that writes hfile output.
101  * Creates a few inner classes to implement splits and an inputformat that
102  * emits keys and values like those of {@link PerformanceEvaluation}.
103  */
104 @Category(LargeTests.class)
105 public class TestHFileOutputFormat  {
  // Per-test timeout scaled for the LargeTests category; also dumps stuck threads.
  @Rule public final TestRule timeout = CategoryBasedTimeout.builder().
      withTimeout(this.getClass()).withLookingForStuckThread(true).build();
  // Number of rows RandomKVGeneratingMapper writes per map task.
  private final static int ROWSPERSPLIT = 1024;

  // Base family name; FAMILIES below derives the actual families from it.
  private static final byte[] FAMILY_NAME = Bytes.toBytes("info");
  // The two column families ("info-A", "info-B") used by every test table.
  private static final byte[][] FAMILIES = {
      Bytes.add(FAMILY_NAME, Bytes.toBytes("-A")),
      Bytes.add(FAMILY_NAME, Bytes.toBytes("-B"))
    };
  private static final TableName TABLE_NAME =
      TableName.valueOf("TestTable");

  // Not final: doIncrementalLoadTest() replaces it with a fresh utility.
  private HBaseTestingUtility util = new HBaseTestingUtility();

  private static final Log LOG = LogFactory.getLog(TestHFileOutputFormat.class);
121 
122   /**
123    * Simple mapper that makes KeyValue output.
124    */
125   static class RandomKVGeneratingMapper extends
126       Mapper<NullWritable, NullWritable, ImmutableBytesWritable, KeyValue> {
127 
128     private int keyLength;
129     private static final int KEYLEN_DEFAULT = 10;
130     private static final String KEYLEN_CONF = "randomkv.key.length";
131 
132     private int valLength;
133     private static final int VALLEN_DEFAULT=10;
134     private static final String VALLEN_CONF="randomkv.val.length";
135     private static final byte [] QUALIFIER = Bytes.toBytes("data");
136 
137     @Override
138     protected void setup(Context context) throws IOException,
139         InterruptedException {
140       super.setup(context);
141 
142       Configuration conf = context.getConfiguration();
143       keyLength = conf.getInt(KEYLEN_CONF, KEYLEN_DEFAULT);
144       valLength = conf.getInt(VALLEN_CONF, VALLEN_DEFAULT);
145     }
146 
147     protected void map(
148         NullWritable n1, NullWritable n2,
149         Mapper<NullWritable, NullWritable,
150                ImmutableBytesWritable,KeyValue>.Context context)
151         throws java.io.IOException ,InterruptedException
152     {
153 
154       byte keyBytes[] = new byte[keyLength];
155       byte valBytes[] = new byte[valLength];
156 
157       int taskId = context.getTaskAttemptID().getTaskID().getId();
158       assert taskId < Byte.MAX_VALUE : "Unit tests dont support > 127 tasks!";
159 
160       Random random = new Random();
161       for (int i = 0; i < ROWSPERSPLIT; i++) {
162 
163         random.nextBytes(keyBytes);
164         // Ensure that unique tasks generate unique keys
165         keyBytes[keyLength - 1] = (byte)(taskId & 0xFF);
166         random.nextBytes(valBytes);
167         ImmutableBytesWritable key = new ImmutableBytesWritable(keyBytes);
168 
169         for (byte[] family : TestHFileOutputFormat.FAMILIES) {
170           KeyValue kv = new KeyValue(keyBytes, family, QUALIFIER, valBytes);
171           context.write(key, kv);
172         }
173       }
174     }
175   }
176 
177   private void setupRandomGeneratorMapper(Job job) {
178     job.setInputFormatClass(NMapInputFormat.class);
179     job.setMapperClass(RandomKVGeneratingMapper.class);
180     job.setMapOutputKeyClass(ImmutableBytesWritable.class);
181     job.setMapOutputValueClass(KeyValue.class);
182   }
183 
184   /**
185    * Test that {@link HFileOutputFormat} RecordWriter amends timestamps if
186    * passed a keyvalue whose timestamp is {@link HConstants#LATEST_TIMESTAMP}.
187    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2615">HBASE-2615</a>
188    */
189   @Test
190   public void test_LATEST_TIMESTAMP_isReplaced()
191   throws Exception {
192     Configuration conf = new Configuration(this.util.getConfiguration());
193     RecordWriter<ImmutableBytesWritable, KeyValue> writer = null;
194     TaskAttemptContext context = null;
195     Path dir =
196       util.getDataTestDir("test_LATEST_TIMESTAMP_isReplaced");
197     try {
198       Job job = new Job(conf);
199       FileOutputFormat.setOutputPath(job, dir);
200       context = createTestTaskAttemptContext(job);
201       HFileOutputFormat hof = new HFileOutputFormat();
202       writer = hof.getRecordWriter(context);
203       final byte [] b = Bytes.toBytes("b");
204 
205       // Test 1.  Pass a KV that has a ts of LATEST_TIMESTAMP.  It should be
206       // changed by call to write.  Check all in kv is same but ts.
207       KeyValue kv = new KeyValue(b, b, b);
208       KeyValue original = kv.clone();
209       writer.write(new ImmutableBytesWritable(), kv);
210       assertFalse(original.equals(kv));
211       assertTrue(Bytes.equals(original.getRow(), kv.getRow()));
212       assertTrue(CellUtil.matchingColumn(original, kv.getFamily(), kv.getQualifier()));
213       assertNotSame(original.getTimestamp(), kv.getTimestamp());
214       assertNotSame(HConstants.LATEST_TIMESTAMP, kv.getTimestamp());
215 
216       // Test 2. Now test passing a kv that has explicit ts.  It should not be
217       // changed by call to record write.
218       kv = new KeyValue(b, b, b, kv.getTimestamp() - 1, b);
219       original = kv.clone();
220       writer.write(new ImmutableBytesWritable(), kv);
221       assertTrue(original.equals(kv));
222     } finally {
223       if (writer != null && context != null) writer.close(context);
224       dir.getFileSystem(conf).delete(dir, true);
225     }
226   }
227 
228   private TaskAttemptContext createTestTaskAttemptContext(final Job job)
229   throws IOException, Exception {
230     HadoopShims hadoop = CompatibilitySingletonFactory.getInstance(HadoopShims.class);
231     TaskAttemptContext context = hadoop.createTestTaskAttemptContext(job, "attempt_200707121733_0001_m_000000_0");
232     return context;
233   }
234 
235   /*
236    * Test that {@link HFileOutputFormat} creates an HFile with TIMERANGE
237    * metadata used by time-restricted scans.
238    */
239   @Test
240   public void test_TIMERANGE() throws Exception {
241     Configuration conf = new Configuration(this.util.getConfiguration());
242     RecordWriter<ImmutableBytesWritable, KeyValue> writer = null;
243     TaskAttemptContext context = null;
244     Path dir =
245       util.getDataTestDir("test_TIMERANGE_present");
246     LOG.info("Timerange dir writing to dir: "+ dir);
247     try {
248       // build a record writer using HFileOutputFormat
249       Job job = new Job(conf);
250       FileOutputFormat.setOutputPath(job, dir);
251       context = createTestTaskAttemptContext(job);
252       HFileOutputFormat hof = new HFileOutputFormat();
253       writer = hof.getRecordWriter(context);
254 
255       // Pass two key values with explicit times stamps
256       final byte [] b = Bytes.toBytes("b");
257 
258       // value 1 with timestamp 2000
259       KeyValue kv = new KeyValue(b, b, b, 2000, b);
260       KeyValue original = kv.clone();
261       writer.write(new ImmutableBytesWritable(), kv);
262       assertEquals(original,kv);
263 
264       // value 2 with timestamp 1000
265       kv = new KeyValue(b, b, b, 1000, b);
266       original = kv.clone();
267       writer.write(new ImmutableBytesWritable(), kv);
268       assertEquals(original, kv);
269 
270       // verify that the file has the proper FileInfo.
271       writer.close(context);
272 
273       // the generated file lives 1 directory down from the attempt directory
274       // and is the only file, e.g.
275       // _attempt__0000_r_000000_0/b/1979617994050536795
276       FileSystem fs = FileSystem.get(conf);
277       Path attemptDirectory = hof.getDefaultWorkFile(context, "").getParent();
278       FileStatus[] sub1 = fs.listStatus(attemptDirectory);
279       FileStatus[] file = fs.listStatus(sub1[0].getPath());
280 
281       // open as HFile Reader and pull out TIMERANGE FileInfo.
282       HFile.Reader rd = HFile.createReader(fs, file[0].getPath(),
283           new CacheConfig(conf), conf);
284       Map<byte[],byte[]> finfo = rd.loadFileInfo();
285       byte[] range = finfo.get("TIMERANGE".getBytes());
286       assertNotNull(range);
287 
288       // unmarshall and check values.
289       TimeRangeTracker timeRangeTracker = new TimeRangeTracker();
290       Writables.copyWritable(range, timeRangeTracker);
291       LOG.info(timeRangeTracker.getMin() + "...." + timeRangeTracker.getMax());
292       assertEquals(1000, timeRangeTracker.getMin());
293       assertEquals(2000, timeRangeTracker.getMax());
294       rd.close();
295     } finally {
296       if (writer != null && context != null) writer.close(context);
297       dir.getFileSystem(conf).delete(dir, true);
298     }
299   }
300 
301   /**
302    * Run small MR job.
303    */
304   @Test
305   public void testWritingPEData() throws Exception {
306     Configuration conf = util.getConfiguration();
307     Path testDir = util.getDataTestDirOnTestFS("testWritingPEData");
308     FileSystem fs = testDir.getFileSystem(conf);
309 
310     // Set down this value or we OOME in eclipse.
311     conf.setInt("mapreduce.task.io.sort.mb", 20);
312     // Write a few files.
313     conf.setLong(HConstants.HREGION_MAX_FILESIZE, 64 * 1024);
314 
315     Job job = new Job(conf, "testWritingPEData");
316     setupRandomGeneratorMapper(job);
317     // This partitioner doesn't work well for number keys but using it anyways
318     // just to demonstrate how to configure it.
319     byte[] startKey = new byte[RandomKVGeneratingMapper.KEYLEN_DEFAULT];
320     byte[] endKey = new byte[RandomKVGeneratingMapper.KEYLEN_DEFAULT];
321 
322     Arrays.fill(startKey, (byte)0);
323     Arrays.fill(endKey, (byte)0xff);
324 
325     job.setPartitionerClass(SimpleTotalOrderPartitioner.class);
326     // Set start and end rows for partitioner.
327     SimpleTotalOrderPartitioner.setStartKey(job.getConfiguration(), startKey);
328     SimpleTotalOrderPartitioner.setEndKey(job.getConfiguration(), endKey);
329     job.setReducerClass(KeyValueSortReducer.class);
330     job.setOutputFormatClass(HFileOutputFormat.class);
331     job.setNumReduceTasks(4);
332     job.getConfiguration().setStrings("io.serializations", conf.get("io.serializations"),
333         MutationSerialization.class.getName(), ResultSerialization.class.getName(),
334         KeyValueSerialization.class.getName());
335 
336     FileOutputFormat.setOutputPath(job, testDir);
337     assertTrue(job.waitForCompletion(false));
338     FileStatus [] files = fs.listStatus(testDir);
339     assertTrue(files.length > 0);
340   }
341 
342   @Test
343   public void testJobConfiguration() throws Exception {
344     Configuration conf = new Configuration(this.util.getConfiguration());
345     conf.set("hbase.fs.tmp.dir", util.getDataTestDir("testJobConfiguration").toString());
346     Job job = new Job(conf);
347     job.setWorkingDirectory(util.getDataTestDir("testJobConfiguration"));
348     HTableDescriptor tableDescriptor = Mockito.mock(HTableDescriptor.class);
349     RegionLocator regionLocator = Mockito.mock(RegionLocator.class);
350     setupMockStartKeys(regionLocator);
351     setupMockTableName(regionLocator);
352     HFileOutputFormat2.configureIncrementalLoad(job, tableDescriptor, regionLocator);
353     assertEquals(job.getNumReduceTasks(), 4);
354   }
355 
356   private byte [][] generateRandomStartKeys(int numKeys) {
357     Random random = new Random();
358     byte[][] ret = new byte[numKeys][];
359     // first region start key is always empty
360     ret[0] = HConstants.EMPTY_BYTE_ARRAY;
361     for (int i = 1; i < numKeys; i++) {
362       ret[i] =
363         PerformanceEvaluation.generateData(random, PerformanceEvaluation.DEFAULT_VALUE_LENGTH);
364     }
365     return ret;
366   }
367 
368   private byte[][] generateRandomSplitKeys(int numKeys) {
369     Random random = new Random();
370     byte[][] ret = new byte[numKeys][];
371     for (int i = 0; i < numKeys; i++) {
372       ret[i] =
373           PerformanceEvaluation.generateData(random, PerformanceEvaluation.DEFAULT_VALUE_LENGTH);
374     }
375     return ret;
376   }
377 
  /**
   * End-to-end incremental load with region boundaries left unchanged
   * between HFile generation and bulk load.
   */
  @Test
  public void testMRIncrementalLoad() throws Exception {
    LOG.info("\nStarting test testMRIncrementalLoad\n");
    doIncrementalLoadTest(false);
  }
383 
  /**
   * End-to-end incremental load where the table is dropped and recreated with
   * different split points after the HFiles are written, forcing the bulk
   * loader to split files on load.
   */
  @Test
  public void testMRIncrementalLoadWithSplit() throws Exception {
    LOG.info("\nStarting test testMRIncrementalLoadWithSplit\n");
    doIncrementalLoadTest(true);
  }
389 
390   private void doIncrementalLoadTest(
391       boolean shouldChangeRegions) throws Exception {
392     util = new HBaseTestingUtility();
393     Configuration conf = util.getConfiguration();
394     byte[][] splitKeys = generateRandomSplitKeys(4);
395     HBaseAdmin admin = null;
396     try {
397       util.setJobWithoutMRCluster();
398       util.startMiniCluster();
399       Path testDir = util.getDataTestDirOnTestFS("testLocalMRIncrementalLoad");
400       admin = util.getHBaseAdmin();
401       HTable table = util.createTable(TABLE_NAME, FAMILIES, splitKeys);
402       assertEquals("Should start with empty table",
403           0, util.countRows(table));
404       int numRegions = -1;
405       try(RegionLocator r = table.getRegionLocator()) {
406         numRegions = r.getStartKeys().length;
407       }
408       assertEquals("Should make 5 regions", numRegions, 5);
409 
410       // Generate the bulk load files
411       runIncrementalPELoad(conf, table, testDir);
412       // This doesn't write into the table, just makes files
413       assertEquals("HFOF should not touch actual table",
414           0, util.countRows(table));
415 
416 
417       // Make sure that a directory was created for every CF
418       int dir = 0;
419       for (FileStatus f : testDir.getFileSystem(conf).listStatus(testDir)) {
420         for (byte[] family : FAMILIES) {
421           if (Bytes.toString(family).equals(f.getPath().getName())) {
422             ++dir;
423           }
424         }
425       }
426       assertEquals("Column family not found in FS.", FAMILIES.length, dir);
427 
428       // handle the split case
429       if (shouldChangeRegions) {
430         LOG.info("Changing regions in table");
431         admin.disableTable(table.getTableName());
432         while(util.getMiniHBaseCluster().getMaster().getAssignmentManager().
433             getRegionStates().isRegionsInTransition()) {
434           Threads.sleep(200);
435           LOG.info("Waiting on table to finish disabling");
436         }
437         util.deleteTable(table.getName());
438         byte[][] newSplitKeys = generateRandomSplitKeys(14);
439         table = util.createTable(TABLE_NAME, FAMILIES, newSplitKeys);
440         while (table.getRegionLocations().size() != 15 ||
441             !admin.isTableAvailable(table.getTableName())) {
442           Thread.sleep(200);
443           LOG.info("Waiting for new region assignment to happen");
444         }
445       }
446 
447       // Perform the actual load
448       new LoadIncrementalHFiles(conf).doBulkLoad(testDir, table);
449 
450       // Ensure data shows up
451       int expectedRows = NMapInputFormat.getNumMapTasks(conf) * ROWSPERSPLIT;
452       assertEquals("LoadIncrementalHFiles should put expected data in table",
453           expectedRows, util.countRows(table));
454       Scan scan = new Scan();
455       ResultScanner results = table.getScanner(scan);
456       for (Result res : results) {
457         assertEquals(FAMILIES.length, res.rawCells().length);
458         Cell first = res.rawCells()[0];
459         for (Cell kv : res.rawCells()) {
460           assertTrue(CellUtil.matchingRow(first, kv));
461           assertTrue(Bytes.equals(CellUtil.cloneValue(first), CellUtil.cloneValue(kv)));
462         }
463       }
464       results.close();
465       String tableDigestBefore = util.checksumRows(table);
466 
467       // Cause regions to reopen
468       admin.disableTable(TABLE_NAME);
469       while (!admin.isTableDisabled(TABLE_NAME)) {
470         Thread.sleep(200);
471         LOG.info("Waiting for table to disable");
472       }
473       admin.enableTable(TABLE_NAME);
474       util.waitTableAvailable(TABLE_NAME);
475       assertEquals("Data should remain after reopening of regions",
476           tableDigestBefore, util.checksumRows(table));
477     } finally {
478       if (admin != null) admin.close();
479       util.shutdownMiniCluster();
480     }
481   }
482 
483   private void runIncrementalPELoad(
484       Configuration conf, HTable table, Path outDir)
485   throws Exception {
486     Job job = new Job(conf, "testLocalMRIncrementalLoad");
487     job.setWorkingDirectory(util.getDataTestDirOnTestFS("runIncrementalPELoad"));
488     job.getConfiguration().setStrings("io.serializations", conf.get("io.serializations"),
489         MutationSerialization.class.getName(), ResultSerialization.class.getName(),
490         KeyValueSerialization.class.getName());
491     setupRandomGeneratorMapper(job);
492     HFileOutputFormat2.configureIncrementalLoad(job, table.getTableDescriptor(),
493         table.getRegionLocator());
494     FileOutputFormat.setOutputPath(job, outDir);
495 
496     Assert.assertFalse( util.getTestFileSystem().exists(outDir)) ;
497 
498     assertEquals(table.getRegionLocator().getAllRegionLocations().size(), job.getNumReduceTasks());
499 
500     assertTrue(job.waitForCompletion(true));
501   }
502 
503   /**
504    * Test for {@link HFileOutputFormat#configureCompression(org.apache.hadoop.hbase.client.Table,
505    * Configuration)} and {@link HFileOutputFormat#createFamilyCompressionMap
506    * (Configuration)}.
507    * Tests that the compression map is correctly serialized into
508    * and deserialized from configuration
509    *
510    * @throws IOException
511    */
512   @Test
513   public void testSerializeDeserializeFamilyCompressionMap() throws IOException {
514     for (int numCfs = 0; numCfs <= 3; numCfs++) {
515       Configuration conf = new Configuration(this.util.getConfiguration());
516       Map<String, Compression.Algorithm> familyToCompression =
517           getMockColumnFamiliesForCompression(numCfs);
518       Table table = Mockito.mock(HTable.class);
519       setupMockColumnFamiliesForCompression(table, familyToCompression);
520       HFileOutputFormat.configureCompression(table, conf);
521 
522       // read back family specific compression setting from the configuration
523       Map<byte[], Algorithm> retrievedFamilyToCompressionMap = HFileOutputFormat
524           .createFamilyCompressionMap(conf);
525 
526       // test that we have a value for all column families that matches with the
527       // used mock values
528       for (Entry<String, Algorithm> entry : familyToCompression.entrySet()) {
529         assertEquals("Compression configuration incorrect for column family:"
530             + entry.getKey(), entry.getValue(),
531             retrievedFamilyToCompressionMap.get(entry.getKey().getBytes()));
532       }
533     }
534   }
535 
536   private void setupMockColumnFamiliesForCompression(Table table,
537       Map<String, Compression.Algorithm> familyToCompression) throws IOException {
538     HTableDescriptor mockTableDescriptor = new HTableDescriptor(TABLE_NAME);
539     for (Entry<String, Compression.Algorithm> entry : familyToCompression.entrySet()) {
540       mockTableDescriptor.addFamily(new HColumnDescriptor(entry.getKey())
541           .setMaxVersions(1)
542           .setCompressionType(entry.getValue())
543           .setBlockCacheEnabled(false)
544           .setTimeToLive(0));
545     }
546     Mockito.doReturn(mockTableDescriptor).when(table).getTableDescriptor();
547   }
548 
549   /**
550    * @return a map from column family names to compression algorithms for
551    *         testing column family compression. Column family names have special characters
552    */
553   private Map<String, Compression.Algorithm>
554       getMockColumnFamiliesForCompression (int numCfs) {
555     Map<String, Compression.Algorithm> familyToCompression = new HashMap<String, Compression.Algorithm>();
556     // use column family names having special characters
557     if (numCfs-- > 0) {
558       familyToCompression.put("Family1!@#!@#&", Compression.Algorithm.LZO);
559     }
560     if (numCfs-- > 0) {
561       familyToCompression.put("Family2=asdads&!AASD", Compression.Algorithm.SNAPPY);
562     }
563     if (numCfs-- > 0) {
564       familyToCompression.put("Family2=asdads&!AASD", Compression.Algorithm.GZ);
565     }
566     if (numCfs-- > 0) {
567       familyToCompression.put("Family3", Compression.Algorithm.NONE);
568     }
569     return familyToCompression;
570   }
571 
572 
573   /**
574    * Test for {@link HFileOutputFormat#configureBloomType(org.apache.hadoop.hbase.client.Table,
575    * Configuration)} and {@link HFileOutputFormat#createFamilyBloomTypeMap
576    * (Configuration)}.
577    * Tests that the compression map is correctly serialized into
578    * and deserialized from configuration
579    *
580    * @throws IOException
581    */
582   @Test
583   public void testSerializeDeserializeFamilyBloomTypeMap() throws IOException {
584     for (int numCfs = 0; numCfs <= 2; numCfs++) {
585       Configuration conf = new Configuration(this.util.getConfiguration());
586       Map<String, BloomType> familyToBloomType =
587           getMockColumnFamiliesForBloomType(numCfs);
588       Table table = Mockito.mock(HTable.class);
589       setupMockColumnFamiliesForBloomType(table,
590           familyToBloomType);
591       HFileOutputFormat.configureBloomType(table, conf);
592 
593       // read back family specific data block encoding settings from the
594       // configuration
595       Map<byte[], BloomType> retrievedFamilyToBloomTypeMap =
596           HFileOutputFormat
597               .createFamilyBloomTypeMap(conf);
598 
599       // test that we have a value for all column families that matches with the
600       // used mock values
601       for (Entry<String, BloomType> entry : familyToBloomType.entrySet()) {
602         assertEquals("BloomType configuration incorrect for column family:"
603             + entry.getKey(), entry.getValue(),
604             retrievedFamilyToBloomTypeMap.get(entry.getKey().getBytes()));
605       }
606     }
607   }
608 
609   private void setupMockColumnFamiliesForBloomType(Table table,
610       Map<String, BloomType> familyToDataBlockEncoding) throws IOException {
611     HTableDescriptor mockTableDescriptor = new HTableDescriptor(TABLE_NAME);
612     for (Entry<String, BloomType> entry : familyToDataBlockEncoding.entrySet()) {
613       mockTableDescriptor.addFamily(new HColumnDescriptor(entry.getKey())
614           .setMaxVersions(1)
615           .setBloomFilterType(entry.getValue())
616           .setBlockCacheEnabled(false)
617           .setTimeToLive(0));
618     }
619     Mockito.doReturn(mockTableDescriptor).when(table).getTableDescriptor();
620   }
621 
  /**
   * @return a map from column family names to bloom filter types for testing
   *         per-family bloom type configuration. Family names include special
   *         characters to exercise serialization.
   */
  private Map<String, BloomType>
  getMockColumnFamiliesForBloomType (int numCfs) {
    Map<String, BloomType> familyToBloomType =
        new HashMap<String, BloomType>();
    // use column family names having special characters
    if (numCfs-- > 0) {
      familyToBloomType.put("Family1!@#!@#&", BloomType.ROW);
    }
    if (numCfs-- > 0) {
      familyToBloomType.put("Family2=asdads&!AASD",
          BloomType.ROWCOL);
    }
    if (numCfs-- > 0) {
      familyToBloomType.put("Family3", BloomType.NONE);
    }
    return familyToBloomType;
  }
643 
644   /**
645    * Test for {@link HFileOutputFormat#configureBlockSize(org.apache.hadoop.hbase.client.Table,
646    * Configuration)} and {@link HFileOutputFormat#createFamilyBlockSizeMap
647    * (Configuration)}.
648    * Tests that the compression map is correctly serialized into
649    * and deserialized from configuration
650    *
651    * @throws IOException
652    */
653   @Test
654   public void testSerializeDeserializeFamilyBlockSizeMap() throws IOException {
655     for (int numCfs = 0; numCfs <= 3; numCfs++) {
656       Configuration conf = new Configuration(this.util.getConfiguration());
657       Map<String, Integer> familyToBlockSize =
658           getMockColumnFamiliesForBlockSize(numCfs);
659       Table table = Mockito.mock(HTable.class);
660       setupMockColumnFamiliesForBlockSize(table,
661           familyToBlockSize);
662       HFileOutputFormat.configureBlockSize(table, conf);
663 
664       // read back family specific data block encoding settings from the
665       // configuration
666       Map<byte[], Integer> retrievedFamilyToBlockSizeMap =
667           HFileOutputFormat
668               .createFamilyBlockSizeMap(conf);
669 
670       // test that we have a value for all column families that matches with the
671       // used mock values
672       for (Entry<String, Integer> entry : familyToBlockSize.entrySet()
673           ) {
674         assertEquals("BlockSize configuration incorrect for column family:"
675             + entry.getKey(), entry.getValue(),
676             retrievedFamilyToBlockSizeMap.get(entry.getKey().getBytes()));
677       }
678     }
679   }
680 
681   private void setupMockColumnFamiliesForBlockSize(Table table,
682       Map<String, Integer> familyToDataBlockEncoding) throws IOException {
683     HTableDescriptor mockTableDescriptor = new HTableDescriptor(TABLE_NAME);
684     for (Entry<String, Integer> entry : familyToDataBlockEncoding.entrySet()) {
685       mockTableDescriptor.addFamily(new HColumnDescriptor(entry.getKey())
686           .setMaxVersions(1)
687           .setBlocksize(entry.getValue())
688           .setBlockCacheEnabled(false)
689           .setTimeToLive(0));
690     }
691     Mockito.doReturn(mockTableDescriptor).when(table).getTableDescriptor();
692   }
693 
694   /**
695    * @return a map from column family names to compression algorithms for
696    *         testing column family compression. Column family names have special characters
697    */
698   private Map<String, Integer>
699   getMockColumnFamiliesForBlockSize (int numCfs) {
700     Map<String, Integer> familyToBlockSize =
701         new HashMap<String, Integer>();
702     // use column family names having special characters
703     if (numCfs-- > 0) {
704       familyToBlockSize.put("Family1!@#!@#&", 1234);
705     }
706     if (numCfs-- > 0) {
707       familyToBlockSize.put("Family2=asdads&!AASD",
708           Integer.MAX_VALUE);
709     }
710     if (numCfs-- > 0) {
711       familyToBlockSize.put("Family2=asdads&!AASD",
712           Integer.MAX_VALUE);
713     }
714     if (numCfs-- > 0) {
715       familyToBlockSize.put("Family3", 0);
716     }
717     return familyToBlockSize;
718   }
719 
720     /**
721    * Test for {@link HFileOutputFormat#configureDataBlockEncoding(org.apache.hadoop.hbase.client.Table,
722    * Configuration)} and {@link HFileOutputFormat#createFamilyDataBlockEncodingMap
723    * (Configuration)}.
724    * Tests that the compression map is correctly serialized into
725    * and deserialized from configuration
726    *
727    * @throws IOException
728    */
729   @Test
730   public void testSerializeDeserializeFamilyDataBlockEncodingMap() throws IOException {
731     for (int numCfs = 0; numCfs <= 3; numCfs++) {
732       Configuration conf = new Configuration(this.util.getConfiguration());
733       Map<String, DataBlockEncoding> familyToDataBlockEncoding =
734           getMockColumnFamiliesForDataBlockEncoding(numCfs);
735       Table table = Mockito.mock(HTable.class);
736       setupMockColumnFamiliesForDataBlockEncoding(table,
737           familyToDataBlockEncoding);
738       HFileOutputFormat.configureDataBlockEncoding(table, conf);
739 
740       // read back family specific data block encoding settings from the
741       // configuration
742       Map<byte[], DataBlockEncoding> retrievedFamilyToDataBlockEncodingMap =
743           HFileOutputFormat
744           .createFamilyDataBlockEncodingMap(conf);
745 
746       // test that we have a value for all column families that matches with the
747       // used mock values
748       for (Entry<String, DataBlockEncoding> entry : familyToDataBlockEncoding.entrySet()) {
749         assertEquals("DataBlockEncoding configuration incorrect for column family:"
750             + entry.getKey(), entry.getValue(),
751             retrievedFamilyToDataBlockEncodingMap.get(entry.getKey().getBytes()));
752       }
753     }
754   }
755 
756   private void setupMockColumnFamiliesForDataBlockEncoding(Table table,
757       Map<String, DataBlockEncoding> familyToDataBlockEncoding) throws IOException {
758     HTableDescriptor mockTableDescriptor = new HTableDescriptor(TABLE_NAME);
759     for (Entry<String, DataBlockEncoding> entry : familyToDataBlockEncoding.entrySet()) {
760       mockTableDescriptor.addFamily(new HColumnDescriptor(entry.getKey())
761           .setMaxVersions(1)
762           .setDataBlockEncoding(entry.getValue())
763           .setBlockCacheEnabled(false)
764           .setTimeToLive(0));
765     }
766     Mockito.doReturn(mockTableDescriptor).when(table).getTableDescriptor();
767   }
768 
769   /**
770    * @return a map from column family names to compression algorithms for
771    *         testing column family compression. Column family names have special characters
772    */
773   private Map<String, DataBlockEncoding>
774       getMockColumnFamiliesForDataBlockEncoding (int numCfs) {
775     Map<String, DataBlockEncoding> familyToDataBlockEncoding =
776         new HashMap<String, DataBlockEncoding>();
777     // use column family names having special characters
778     if (numCfs-- > 0) {
779       familyToDataBlockEncoding.put("Family1!@#!@#&", DataBlockEncoding.DIFF);
780     }
781     if (numCfs-- > 0) {
782       familyToDataBlockEncoding.put("Family2=asdads&!AASD",
783           DataBlockEncoding.FAST_DIFF);
784     }
785     if (numCfs-- > 0) {
786       familyToDataBlockEncoding.put("Family2=asdads&!AASD",
787           DataBlockEncoding.PREFIX);
788     }
789     if (numCfs-- > 0) {
790       familyToDataBlockEncoding.put("Family3", DataBlockEncoding.NONE);
791     }
792     return familyToDataBlockEncoding;
793   }
794 
795   private void setupMockStartKeys(RegionLocator regionLocator) throws IOException {
796     byte[][] mockKeys = new byte[][] {
797         HConstants.EMPTY_BYTE_ARRAY,
798         Bytes.toBytes("aaa"),
799         Bytes.toBytes("ggg"),
800         Bytes.toBytes("zzz")
801     };
802     Mockito.doReturn(mockKeys).when(regionLocator).getStartKeys();
803   }
804 
805   private void setupMockTableName(RegionLocator table) throws IOException {
806     TableName mockTableName = TableName.valueOf("mock_table");
807     Mockito.doReturn(mockTableName).when(table).getName();
808   }
809 
810   /**
811    * Test that {@link HFileOutputFormat} RecordWriter uses compression and
812    * bloom filter settings from the column family descriptor
813    */
814   @Test
815   public void testColumnFamilySettings() throws Exception {
816     Configuration conf = new Configuration(this.util.getConfiguration());
817     RecordWriter<ImmutableBytesWritable, KeyValue> writer = null;
818     TaskAttemptContext context = null;
819     Path dir = util.getDataTestDir("testColumnFamilySettings");
820 
821     // Setup table descriptor
822     HTable table = Mockito.mock(HTable.class);
823     RegionLocator regionLocator = Mockito.mock(RegionLocator.class);
824     HTableDescriptor htd = new HTableDescriptor(TABLE_NAME);
825     Mockito.doReturn(htd).when(table).getTableDescriptor();
826     for (HColumnDescriptor hcd: this.util.generateColumnDescriptors()) {
827       htd.addFamily(hcd);
828     }
829 
830     // set up the table to return some mock keys
831     setupMockStartKeys(regionLocator);
832 
833     try {
834       // partial map red setup to get an operational writer for testing
835       // We turn off the sequence file compression, because DefaultCodec
836       // pollutes the GZip codec pool with an incompatible compressor.
837       conf.set("io.seqfile.compression.type", "NONE");
838       conf.set("hbase.fs.tmp.dir", dir.toString());
839       // turn locality off to eliminate getRegionLocation fail-and-retry time when writing kvs
840       conf.setBoolean(HFileOutputFormat2.LOCALITY_SENSITIVE_CONF_KEY, false);
841       Job job = new Job(conf, "testLocalMRIncrementalLoad");
842       job.setWorkingDirectory(util.getDataTestDirOnTestFS("testColumnFamilySettings"));
843       setupRandomGeneratorMapper(job);
844       HFileOutputFormat2.configureIncrementalLoad(job, table.getTableDescriptor(), regionLocator);
845       FileOutputFormat.setOutputPath(job, dir);
846       context = createTestTaskAttemptContext(job);
847       HFileOutputFormat hof = new HFileOutputFormat();
848       writer = hof.getRecordWriter(context);
849 
850       // write out random rows
851       writeRandomKeyValues(writer, context, htd.getFamiliesKeys(), ROWSPERSPLIT);
852       writer.close(context);
853 
854       // Make sure that a directory was created for every CF
855       FileSystem fs = dir.getFileSystem(conf);
856 
857       // commit so that the filesystem has one directory per column family
858       hof.getOutputCommitter(context).commitTask(context);
859       hof.getOutputCommitter(context).commitJob(context);
860       FileStatus[] families = FSUtils.listStatus(fs, dir, new FSUtils.FamilyDirFilter(fs));
861       assertEquals(htd.getFamilies().size(), families.length);
862       for (FileStatus f : families) {
863         String familyStr = f.getPath().getName();
864         HColumnDescriptor hcd = htd.getFamily(Bytes.toBytes(familyStr));
865         // verify that the compression on this file matches the configured
866         // compression
867         Path dataFilePath = fs.listStatus(f.getPath())[0].getPath();
868         Reader reader = HFile.createReader(fs, dataFilePath, new CacheConfig(conf), conf);
869         Map<byte[], byte[]> fileInfo = reader.loadFileInfo();
870 
871         byte[] bloomFilter = fileInfo.get(StoreFile.BLOOM_FILTER_TYPE_KEY);
872         if (bloomFilter == null) bloomFilter = Bytes.toBytes("NONE");
873         assertEquals("Incorrect bloom filter used for column family " + familyStr +
874           "(reader: " + reader + ")",
875           hcd.getBloomFilterType(), BloomType.valueOf(Bytes.toString(bloomFilter)));
876         assertEquals("Incorrect compression used for column family " + familyStr +
877           "(reader: " + reader + ")", hcd.getCompression(), reader.getFileContext().getCompression());
878       }
879     } finally {
880       dir.getFileSystem(conf).delete(dir, true);
881     }
882   }
883 
884   /**
885    * Write random values to the writer assuming a table created using
886    * {@link #FAMILIES} as column family descriptors
887    */
888   private void writeRandomKeyValues(RecordWriter<ImmutableBytesWritable, KeyValue> writer,
889       TaskAttemptContext context, Set<byte[]> families, int numRows)
890       throws IOException, InterruptedException {
891     byte keyBytes[] = new byte[Bytes.SIZEOF_INT];
892     int valLength = 10;
893     byte valBytes[] = new byte[valLength];
894 
895     int taskId = context.getTaskAttemptID().getTaskID().getId();
896     assert taskId < Byte.MAX_VALUE : "Unit tests dont support > 127 tasks!";
897     final byte [] qualifier = Bytes.toBytes("data");
898     Random random = new Random();
899     for (int i = 0; i < numRows; i++) {
900 
901       Bytes.putInt(keyBytes, 0, i);
902       random.nextBytes(valBytes);
903       ImmutableBytesWritable key = new ImmutableBytesWritable(keyBytes);
904 
905       for (byte[] family : families) {
906         KeyValue kv = new KeyValue(keyBytes, family, qualifier, valBytes);
907         writer.write(key, kv);
908       }
909     }
910   }
911 
912   /**
913    * This test is to test the scenario happened in HBASE-6901.
914    * All files are bulk loaded and excluded from minor compaction.
915    * Without the fix of HBASE-6901, an ArrayIndexOutOfBoundsException
916    * will be thrown.
917    */
918   @Ignore ("Flakey: See HBASE-9051") @Test
919   public void testExcludeAllFromMinorCompaction() throws Exception {
920     Configuration conf = util.getConfiguration();
921     conf.setInt("hbase.hstore.compaction.min", 2);
922     generateRandomStartKeys(5);
923 
924     try {
925       util.setJobWithoutMRCluster();
926       util.startMiniCluster();
927       final FileSystem fs = util.getDFSCluster().getFileSystem();
928       HBaseAdmin admin = new HBaseAdmin(conf);
929       HTable table = util.createTable(TABLE_NAME, FAMILIES);
930       assertEquals("Should start with empty table", 0, util.countRows(table));
931 
932       // deep inspection: get the StoreFile dir
933       final Path storePath = HStore.getStoreHomedir(
934           FSUtils.getTableDir(FSUtils.getRootDir(conf), TABLE_NAME),
935           admin.getTableRegions(TABLE_NAME).get(0),
936           FAMILIES[0]);
937       assertEquals(0, fs.listStatus(storePath).length);
938 
939       // Generate two bulk load files
940       conf.setBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude",
941           true);
942 
943       for (int i = 0; i < 2; i++) {
944         Path testDir = util.getDataTestDirOnTestFS("testExcludeAllFromMinorCompaction_" + i);
945         runIncrementalPELoad(conf, table, testDir);
946         // Perform the actual load
947         new LoadIncrementalHFiles(conf).doBulkLoad(testDir, table);
948       }
949 
950       // Ensure data shows up
951       int expectedRows = 2 * NMapInputFormat.getNumMapTasks(conf) * ROWSPERSPLIT;
952       assertEquals("LoadIncrementalHFiles should put expected data in table",
953           expectedRows, util.countRows(table));
954 
955       // should have a second StoreFile now
956       assertEquals(2, fs.listStatus(storePath).length);
957 
958       // minor compactions shouldn't get rid of the file
959       admin.compact(TABLE_NAME.getName());
960       try {
961         quickPoll(new Callable<Boolean>() {
962           public Boolean call() throws Exception {
963             return fs.listStatus(storePath).length == 1;
964           }
965         }, 5000);
966         throw new IOException("SF# = " + fs.listStatus(storePath).length);
967       } catch (AssertionError ae) {
968         // this is expected behavior
969       }
970 
971       // a major compaction should work though
972       admin.majorCompact(TABLE_NAME.getName());
973       quickPoll(new Callable<Boolean>() {
974         public Boolean call() throws Exception {
975           return fs.listStatus(storePath).length == 1;
976         }
977       }, 5000);
978 
979     } finally {
980       util.shutdownMiniCluster();
981     }
982   }
983 
  /**
   * Verifies that a bulk-loaded HFile flagged with the compaction-exclude
   * marker survives a minor compaction but is merged by a major compaction.
   * Starts from one flushed StoreFile plus one bulk-loaded file (so two
   * files total, meeting hbase.hstore.compaction.min = 2).
   * NOTE(review): admin and table are never closed on the failure path;
   * consider try-with-resources — confirm against the rest of this class.
   */
  @Test
  public void testExcludeMinorCompaction() throws Exception {
    Configuration conf = util.getConfiguration();
    // require at least 2 files before a compaction may run
    conf.setInt("hbase.hstore.compaction.min", 2);
    generateRandomStartKeys(5);

    try {
      util.setJobWithoutMRCluster();
      util.startMiniCluster();
      Path testDir = util.getDataTestDirOnTestFS("testExcludeMinorCompaction");
      final FileSystem fs = util.getTestFileSystem();
      HBaseAdmin admin = new HBaseAdmin(conf);
      HTable table = util.createTable(TABLE_NAME, FAMILIES);
      assertEquals("Should start with empty table", 0, util.countRows(table));

      // deep inspection: get the StoreFile dir
      final Path storePath = HStore.getStoreHomedir(
          FSUtils.getTableDir(FSUtils.getRootDir(conf), TABLE_NAME),
          admin.getTableRegions(TABLE_NAME).get(0),
          FAMILIES[0]);
      assertEquals(0, fs.listStatus(storePath).length);

      // put some data in it and flush to create a storefile
      Put p = new Put(Bytes.toBytes("test"));
      p.add(FAMILIES[0], Bytes.toBytes("1"), Bytes.toBytes("1"));
      table.put(p);
      admin.flush(TABLE_NAME.getName());
      assertEquals(1, util.countRows(table));
      // wait for the flush to materialize exactly one StoreFile
      quickPoll(new Callable<Boolean>() {
        public Boolean call() throws Exception {
          return fs.listStatus(storePath).length == 1;
        }
      }, 5000);

      // Generate a bulk load file with more rows, flagged for exclusion
      // from minor compaction
      conf.setBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude",
          true);
      runIncrementalPELoad(conf, table, testDir);

      // Perform the actual load
      new LoadIncrementalHFiles(conf).doBulkLoad(testDir, table);

      // Ensure data shows up (the +1 accounts for the single Put above)
      int expectedRows = NMapInputFormat.getNumMapTasks(conf) * ROWSPERSPLIT;
      assertEquals("LoadIncrementalHFiles should put expected data in table",
          expectedRows + 1, util.countRows(table));

      // should have a second StoreFile now
      assertEquals(2, fs.listStatus(storePath).length);

      // minor compactions shouldn't get rid of the file
      admin.compact(TABLE_NAME.getName());
      try {
        // if this poll ever sees a single StoreFile, the minor compaction
        // wrongly merged the excluded file — that is a test failure
        quickPoll(new Callable<Boolean>() {
          public Boolean call() throws Exception {
            // archive already-compacted files so the listing reflects the
            // live StoreFile count
            List<HRegion> regions = util.getMiniHBaseCluster().getRegions(TABLE_NAME);
            for (HRegion region : regions) {
              for (Store store : region.getStores()) {
                store.closeAndArchiveCompactedFiles();
              }
            }
            return fs.listStatus(storePath).length == 1;
          }
        }, 5000);
        throw new IOException("SF# = " + fs.listStatus(storePath).length);
      } catch (AssertionError ae) {
        // this is expected behavior: quickPoll fails because the excluded
        // file was never compacted away
      }

      // a major compaction should work though
      admin.majorCompact(TABLE_NAME.getName());
      quickPoll(new Callable<Boolean>() {
        public Boolean call() throws Exception {
          List<HRegion> regions = util.getMiniHBaseCluster().getRegions(TABLE_NAME);
          for (HRegion region : regions) {
            for (Store store : region.getStores()) {
              store.closeAndArchiveCompactedFiles();
            }
          }
          return fs.listStatus(storePath).length == 1;
        }
      }, 5000);

    } finally {
      util.shutdownMiniCluster();
    }
  }
1071 
1072   private void quickPoll(Callable<Boolean> c, int waitMs) throws Exception {
1073     int sleepMs = 10;
1074     int retries = (int) Math.ceil(((double) waitMs) / sleepMs);
1075     while (retries-- > 0) {
1076       if (c.call().booleanValue()) {
1077         return;
1078       }
1079       Thread.sleep(sleepMs);
1080     }
1081     fail();
1082   }
1083 
  /** Command-line entry point; see {@link #manualTest(String[])} for usage. */
  public static void main(String args[]) throws Exception {
    new TestHFileOutputFormat().manualTest(args);
  }
1087 
1088   public void manualTest(String args[]) throws Exception {
1089     Configuration conf = HBaseConfiguration.create();
1090     util = new HBaseTestingUtility(conf);
1091     if ("newtable".equals(args[0])) {
1092       TableName tname = TableName.valueOf(args[1]);
1093       byte[][] splitKeys = generateRandomSplitKeys(4);
1094       HTable table = util.createTable(tname, FAMILIES, splitKeys);
1095     } else if ("incremental".equals(args[0])) {
1096       TableName tname = TableName.valueOf(args[1]);
1097       HTable table = new HTable(conf, tname);
1098       Path outDir = new Path("incremental-out");
1099       runIncrementalPELoad(conf, table, outDir);
1100     } else {
1101       throw new RuntimeException(
1102           "usage: TestHFileOutputFormat newtable | incremental");
1103     }
1104   }
1105 
1106 }
1107