1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.mapreduce;
20
21 import static org.junit.Assert.assertEquals;
22 import static org.junit.Assert.assertFalse;
23 import static org.junit.Assert.assertTrue;
24
25 import java.io.IOException;
26 import java.util.Arrays;
27 import java.util.HashMap;
28 import java.util.HashSet;
29 import java.util.Iterator;
30 import java.util.List;
31 import java.util.Map;
32 import java.util.Set;
33 import java.util.UUID;
34
35 import org.apache.commons.logging.Log;
36 import org.apache.commons.logging.LogFactory;
37 import org.apache.hadoop.conf.Configurable;
38 import org.apache.hadoop.conf.Configuration;
39 import org.apache.hadoop.fs.FSDataOutputStream;
40 import org.apache.hadoop.fs.FileStatus;
41 import org.apache.hadoop.fs.FileSystem;
42 import org.apache.hadoop.fs.Path;
43 import org.apache.hadoop.hbase.Cell;
44 import org.apache.hadoop.hbase.CellUtil;
45 import org.apache.hadoop.hbase.HBaseConfiguration;
46 import org.apache.hadoop.hbase.HBaseTestingUtility;
47 import org.apache.hadoop.hbase.HConstants;
48 import org.apache.hadoop.hbase.testclassification.LargeTests;
49 import org.apache.hadoop.hbase.TableName;
50 import org.apache.hadoop.hbase.TableNotFoundException;
51 import org.apache.hadoop.hbase.client.HTable;
52 import org.apache.hadoop.hbase.client.Result;
53 import org.apache.hadoop.hbase.client.ResultScanner;
54 import org.apache.hadoop.hbase.client.Scan;
55 import org.apache.hadoop.hbase.client.Table;
56 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
57 import org.apache.hadoop.hbase.io.hfile.HFile;
58 import org.apache.hadoop.hbase.io.hfile.HFileScanner;
59 import org.apache.hadoop.hbase.util.Bytes;
60 import org.apache.hadoop.io.Text;
61 import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter;
62 import org.apache.hadoop.mapreduce.Job;
63 import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
64 import org.apache.hadoop.util.GenericOptionsParser;
65 import org.apache.hadoop.util.Tool;
66 import org.apache.hadoop.util.ToolRunner;
67 import org.junit.AfterClass;
68 import org.junit.Before;
69 import org.junit.BeforeClass;
70 import org.junit.Rule;
71 import org.junit.Test;
72 import org.junit.experimental.categories.Category;
73 import org.junit.rules.ExpectedException;
74
75 @Category(LargeTests.class)
76 public class TestImportTsv implements Configurable {
77
  private static final Log LOG = LogFactory.getLog(TestImportTsv.class);
  protected static final String NAME = TestImportTsv.class.getSimpleName();
  // Shared mini-cluster utility; started once in provisionCluster().
  protected static HBaseTestingUtility util = new HBaseTestingUtility();

  // Conf key: when true (the default), delete the per-test data directory
  // on the test FS after a successful load.
  protected static final String DELETE_AFTER_LOAD_CONF = NAME + ".deleteAfterLoad";

  // Conf key: when true (the default), force the MR combiner to run by
  // lowering the map-side spill threshold to 1.
  protected static final String FORCE_COMBINER_CONF = NAME + ".forceCombiner";

  private final String FAMILY = "FAM";
  // Per-test unique table name; regenerated in setup().
  private String table;
  // -D arguments handed to ImportTsv; reset to defaults in setup().
  private Map<String, String> args;

  @Rule
  public ExpectedException exception = ExpectedException.none();
96
  /** Configurable contract: expose the shared mini-cluster configuration. */
  public Configuration getConf() {
    return util.getConfiguration();
  }
100
101 public void setConf(Configuration conf) {
102 throw new IllegalArgumentException("setConf not supported");
103 }
104
  /** Starts the shared mini-cluster once for every test in this class. */
  @BeforeClass
  public static void provisionCluster() throws Exception {
    // Run MR jobs in-process instead of spinning up a separate MR cluster.
    util.setJobWithoutMRCluster();
    util.startMiniCluster();
  }
110
  /** Shuts the shared mini-cluster down after all tests have run. */
  @AfterClass
  public static void releaseCluster() throws Exception {
    util.shutdownMiniCluster();
  }
115
  /** Generates a unique table name and the default ImportTsv arguments. */
  @Before
  public void setup() throws Exception {
    table = "test-" + UUID.randomUUID();
    args = new HashMap<String, String>();

    // Defaults: row key plus two columns of family FAM, separated by the
    // standard ImportTsv separator (0x1b). Individual tests override these.
    args.put(ImportTsv.COLUMNS_CONF_KEY, "HBASE_ROW_KEY,FAM:A,FAM:B");
    args.put(ImportTsv.SEPARATOR_CONF_KEY, "\u001b");
  }
124
  /** Basic end-to-end import into a pre-existing table. */
  @Test
  public void testMROnTable() throws Exception {
    util.createTable(TableName.valueOf(table), FAMILY);
    doMROnTableTest(null, 1);
    util.deleteTable(table);
  }
131
  /** Import with an explicit HBASE_TS_KEY timestamp column. */
  @Test
  public void testMROnTableWithTimestamp() throws Exception {
    util.createTable(TableName.valueOf(table), FAMILY);
    args.put(ImportTsv.COLUMNS_CONF_KEY, "HBASE_ROW_KEY,HBASE_TS_KEY,FAM:A,FAM:B");
    args.put(ImportTsv.SEPARATOR_CONF_KEY, ",");
    // Row KEY with timestamp 1234 and the two expected values.
    String data = "KEY,1234,VALUE1,VALUE2\n";

    doMROnTableTest(data, 1);
    util.deleteTable(table);
  }
142
  /**
   * Import via a custom mapper class. Multiplier 3 matches the values the
   * custom test mapper emits.
   */
  @Test
  public void testMROnTableWithCustomMapper()
  throws Exception {
    util.createTable(TableName.valueOf(table), FAMILY);
    args.put(ImportTsv.MAPPER_CONF_KEY,
        "org.apache.hadoop.hbase.mapreduce.TsvImporterCustomTestMapper");

    doMROnTableTest(null, 3);
    util.deleteTable(table);
  }
153
  /**
   * Bulk-output mode against a table that does not exist yet; ImportTsv is
   * expected to create it (create-table defaults to "yes").
   */
  @Test
  public void testBulkOutputWithoutAnExistingTable() throws Exception {
    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, hfiles.toString());

    doMROnTableTest(null, 3);
    util.deleteTable(table);
  }
163
  /** Bulk-output mode against a pre-created table. */
  @Test
  public void testBulkOutputWithAnExistingTable() throws Exception {
    util.createTable(TableName.valueOf(table), FAMILY);

    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, hfiles.toString());

    doMROnTableTest(null, 3);
    util.deleteTable(table);
  }
175
  /** Bulk-output mode with strict column-family checking disabled. */
  @Test
  public void testBulkOutputWithAnExistingTableNoStrictTrue() throws Exception {
    util.createTable(TableName.valueOf(table), FAMILY);

    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, hfiles.toString());
    args.put(ImportTsv.NO_STRICT_COL_FAMILY, "true");
    doMROnTableTest(null, 3);
    util.deleteTable(table);
  }
187
  /**
   * Verifies createSubmittableJob wires TsvImporterTextMapper together with
   * TextSortReducer and Text map-output values when bulk output is requested.
   * Only job configuration is checked; the job is never submitted.
   */
  @Test
  public void testJobConfigurationsWithTsvImporterTextMapper() throws Exception {
    Path bulkOutputPath = new Path(util.getDataTestDirOnTestFS(table),"hfiles");
    String INPUT_FILE = "InputFile1.csv";

    String[] args =
        new String[] {
            "-D" + ImportTsv.MAPPER_CONF_KEY
                + "=org.apache.hadoop.hbase.mapreduce.TsvImporterTextMapper",
            "-D" + ImportTsv.COLUMNS_CONF_KEY
                + "=HBASE_ROW_KEY,FAM:A,FAM:B",
            "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=,",
            "-D" + ImportTsv.BULK_OUTPUT_CONF_KEY + "=" + bulkOutputPath.toString(), table,
            INPUT_FILE
        };
    Configuration conf = new Configuration(util.getConfiguration());
    GenericOptionsParser opts = new GenericOptionsParser(conf, args);
    args = opts.getRemainingArgs();
    assertEquals("running test job configuration failed.", 0,
        ToolRunner.run(conf, new ImportTsv() {
              @Override
              public int run(String[] args) throws Exception {
                Job job = createSubmittableJob(getConf(), args);
                assertTrue(job.getMapperClass().equals(TsvImporterTextMapper.class));
                assertTrue(job.getReducerClass().equals(TextSortReducer.class));
                assertTrue(job.getMapOutputValueClass().equals(Text.class));
                return 0;
              }
            }, args));
    // createSubmittableJob created the table (bulk output + create=yes),
    // so it must be cleaned up here.
    util.deleteTable(table);
  }
220
  /** End-to-end bulk output using TsvImporterTextMapper; expects 4 KVs. */
  @Test
  public void testBulkOutputWithTsvImporterTextMapper() throws Exception {
    Path bulkOutputPath = new Path(util.getDataTestDirOnTestFS(table),"hfiles");
    args.put(ImportTsv.MAPPER_CONF_KEY, "org.apache.hadoop.hbase.mapreduce.TsvImporterTextMapper");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, bulkOutputPath.toString());
    String data = "KEY\u001bVALUE4\u001bVALUE8\n";
    doMROnTableTest(data, 4);
    util.deleteTable(table);
  }
230
  /**
   * With create-table=no and a missing table, job creation must fail with
   * TableNotFoundException.
   */
  @Test
  public void testWithoutAnExistingTableAndCreateTableSetToNo() throws Exception {
    String[] args = new String[] { table, "/inputFile" };

    Configuration conf = new Configuration(util.getConfiguration());
    conf.set(ImportTsv.COLUMNS_CONF_KEY, "HBASE_ROW_KEY,FAM:A");
    conf.set(ImportTsv.BULK_OUTPUT_CONF_KEY, "/output");
    conf.set(ImportTsv.CREATE_TABLE_CONF_KEY, "no");
    exception.expect(TableNotFoundException.class);
    assertEquals("running test job configuration failed.", 0,
        ToolRunner.run(conf, new ImportTsv() {
              @Override public int run(String[] args) throws Exception {
                createSubmittableJob(getConf(), args);
                return 0;
              }
            }, args));
  }
248
  /**
   * Without bulk output, a missing target table must make job creation fail
   * with TableNotFoundException.
   */
  @Test
  public void testMRWithoutAnExistingTable() throws Exception {
    String[] args =
        new String[] { table, "/inputFile" };

    exception.expect(TableNotFoundException.class);
    assertEquals("running test job configuration failed.", 0, ToolRunner.run(
        new Configuration(util.getConfiguration()),
        new ImportTsv() {
          @Override
          public int run(String[] args) throws Exception {
            createSubmittableJob(getConf(), args);
            return 0;
          }
        }, args));
  }
265
  /** Dry-run mode must configure the job with NullOutputFormat. */
  @Test
  public void testJobConfigurationsWithDryMode() throws Exception {
    Path bulkOutputPath = new Path(util.getDataTestDirOnTestFS(table),"hfiles");
    String INPUT_FILE = "InputFile1.csv";

    String[] argsArray = new String[] {
        "-D" + ImportTsv.COLUMNS_CONF_KEY + "=HBASE_ROW_KEY,FAM:A,FAM:B",
        "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=,",
        "-D" + ImportTsv.BULK_OUTPUT_CONF_KEY + "=" + bulkOutputPath.toString(),
        "-D" + ImportTsv.DRY_RUN_CONF_KEY + "=true",
        table,
        INPUT_FILE };
    assertEquals("running test job configuration failed.", 0, ToolRunner.run(
        new Configuration(util.getConfiguration()),
        new ImportTsv() {
          @Override
          public int run(String[] args) throws Exception {
            Job job = createSubmittableJob(getConf(), args);
            // Dry run: nothing may be written, so output is discarded.
            assertTrue(job.getOutputFormatClass().equals(NullOutputFormat.class));
            return 0;
          }
        }, argsArray));

    util.deleteTable(table);
  }
291
  /** Dry run against an existing table: the table must remain empty. */
  @Test
  public void testDryModeWithoutBulkOutputAndTableExists() throws Exception {
    util.createTable(TableName.valueOf(table), FAMILY);
    args.put(ImportTsv.DRY_RUN_CONF_KEY, "true");
    // valueMultiplier is irrelevant here; validateTable asserts zero rows
    // when dry run is enabled.
    doMROnTableTest(null, 1);
    util.deleteTable(table);
  }
301
302
303
304
305
  /** Dry run without bulk output and a missing table must fail. */
  @Test
  public void testDryModeWithoutBulkOutputAndTableDoesNotExists() throws Exception {
    args.put(ImportTsv.DRY_RUN_CONF_KEY, "true");
    exception.expect(TableNotFoundException.class);
    doMROnTableTest(null, 1);
  }
312
  /** Dry run in bulk-output mode: no HFiles may be written. */
  @Test public void testDryModeWithBulkOutputAndTableExists() throws Exception {
    util.createTable(TableName.valueOf(table), FAMILY);

    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, hfiles.toString());
    args.put(ImportTsv.DRY_RUN_CONF_KEY, "true");
    doMROnTableTest(null, 1);

    util.deleteTable(table);
  }
324
325
326
327
328
  /**
   * Dry run, bulk output, missing table, create-table=no: must fail with
   * TableNotFoundException.
   */
  @Test
  public void testDryModeWithBulkOutputAndTableDoesNotExistsCreateTableSetToNo() throws
      Exception {
    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, hfiles.toString());
    args.put(ImportTsv.DRY_RUN_CONF_KEY, "true");
    args.put(ImportTsv.CREATE_TABLE_CONF_KEY, "no");
    exception.expect(TableNotFoundException.class);
    doMROnTableTest(null, 1);
  }
340
  /**
   * Dry run, bulk output, missing table, create-table=yes: the run succeeds,
   * but the table must NOT actually be created — so the trailing deleteTable
   * is expected to throw TableNotFoundException.
   */
  @Test
  public void testDryModeWithBulkModeAndTableDoesNotExistsCreateTableSetToYes() throws Exception {
    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, hfiles.toString());
    args.put(ImportTsv.DRY_RUN_CONF_KEY, "true");
    args.put(ImportTsv.CREATE_TABLE_CONF_KEY, "yes");
    doMROnTableTest(null, 1);

    exception.expect(TableNotFoundException.class);
    util.deleteTable(table);
  }
353
354
355
356
  /**
   * Bad line ("KEY" with too few fields) must be skipped; only the two valid
   * rows (4 KVs total) end up in the bulk output.
   */
  @Test
  public void testTsvImporterTextMapperWithInvalidData() throws Exception {
    Path bulkOutputPath = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    args.put(ImportTsv.MAPPER_CONF_KEY, "org.apache.hadoop.hbase.mapreduce.TsvImporterTextMapper");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, bulkOutputPath.toString());
    args.put(ImportTsv.COLUMNS_CONF_KEY, "HBASE_ROW_KEY,HBASE_TS_KEY,FAM:A,FAM:B");
    args.put(ImportTsv.SEPARATOR_CONF_KEY, ",");

    String data = "KEY,1234,VALUE1,VALUE2\nKEY\nKEY,1235,VALUE1,VALUE2\n";
    doMROnTableTest(data, 1, 4);
    util.deleteTable(table);
  }
369
  /**
   * With SKIP_EMPTY_COLUMNS the empty FAM:A cell of the second row is
   * dropped, leaving 3 KVs in the bulk output.
   */
  @Test
  public void testSkipEmptyColumns() throws Exception {
    Path bulkOutputPath = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    args.put(ImportTsv.BULK_OUTPUT_CONF_KEY, bulkOutputPath.toString());
    args.put(ImportTsv.COLUMNS_CONF_KEY, "HBASE_ROW_KEY,HBASE_TS_KEY,FAM:A,FAM:B");
    args.put(ImportTsv.SEPARATOR_CONF_KEY, ",");
    args.put(ImportTsv.SKIP_EMPTY_COLUMNS, "true");

    String data = "KEY,1234,VALUE1,VALUE2\nKEY,1235,,VALUE2\n";
    doMROnTableTest(util, table, FAMILY, data, args, 1, 3);
    util.deleteTable(table);
  }
382
  /** Convenience overload using this test's util, table, family and args. */
  private Tool doMROnTableTest(String data, int valueMultiplier, int expectedKVCount)
  throws Exception {
    return doMROnTableTest(util, table, FAMILY, data, args, valueMultiplier, expectedKVCount);
  }
387
  /** Convenience overload: no KV-count check (expectedKVCount = -1). */
  private Tool doMROnTableTest(String data, int valueMultiplier) throws Exception {
    return doMROnTableTest(util, table, FAMILY, data, args, valueMultiplier, -1);
  }
391
  /** Static overload for subclasses: multiplier 1, no KV-count check. */
  protected static Tool doMROnTableTest(HBaseTestingUtility util, String table,
      String family, String data, Map<String, String> args) throws Exception {
    return doMROnTableTest(util, table, family, data, args, 1, -1);
  }
396
397
398
399
400
401
402
403
404
405 protected static Tool doMROnTableTest(HBaseTestingUtility util, String table,
406 String family, String data, Map<String, String> args, int valueMultiplier,int expectedKVCount)
407 throws Exception {
408 Configuration conf = new Configuration(util.getConfiguration());
409
410
411 FileSystem fs = FileSystem.get(conf);
412 Path inputPath = fs.makeQualified(new Path(util.getDataTestDirOnTestFS(table), "input.dat"));
413 FSDataOutputStream op = fs.create(inputPath, true);
414 if (data == null) {
415 data = "KEY\u001bVALUE1\u001bVALUE2\n";
416 }
417 op.write(Bytes.toBytes(data));
418 op.close();
419 LOG.debug(String.format("Wrote test data to file: %s", inputPath));
420
421 if (conf.getBoolean(FORCE_COMBINER_CONF, true)) {
422 LOG.debug("Forcing combiner.");
423 conf.setInt("mapreduce.map.combine.minspills", 1);
424 }
425
426
427 String[] argsArray = new String[args.size() + 2];
428 Iterator it = args.entrySet().iterator();
429 int i = 0;
430 while (it.hasNext()) {
431 Map.Entry pair = (Map.Entry) it.next();
432 argsArray[i] = "-D" + pair.getKey() + "=" + pair.getValue();
433 i++;
434 }
435 argsArray[i] = table;
436 argsArray[i + 1] = inputPath.toString();
437
438
439 Tool tool = new ImportTsv();
440 LOG.debug("Running ImportTsv with arguments: " + Arrays.toString(argsArray));
441 assertEquals(0, ToolRunner.run(conf, tool, argsArray));
442
443
444
445
446 boolean isDryRun = args.containsKey(ImportTsv.DRY_RUN_CONF_KEY) &&
447 "true".equalsIgnoreCase(args.get(ImportTsv.DRY_RUN_CONF_KEY));
448 if (args.containsKey(ImportTsv.BULK_OUTPUT_CONF_KEY)) {
449 if (isDryRun) {
450 assertFalse(String.format("Dry run mode, %s should not have been created.",
451 ImportTsv.BULK_OUTPUT_CONF_KEY),
452 fs.exists(new Path(ImportTsv.BULK_OUTPUT_CONF_KEY)));
453 } else {
454 validateHFiles(fs, args.get(ImportTsv.BULK_OUTPUT_CONF_KEY), family,expectedKVCount);
455 }
456 } else {
457 validateTable(conf, TableName.valueOf(table), family, valueMultiplier, isDryRun);
458 }
459
460 if (conf.getBoolean(DELETE_AFTER_LOAD_CONF, true)) {
461 LOG.debug("Deleting test subdirectory");
462 util.cleanupDataTestDirOnTestFS(table);
463 }
464 return tool;
465 }
466
467
468
469
470 private static void validateTable(Configuration conf, TableName tableName,
471 String family, int valueMultiplier, boolean isDryRun) throws IOException {
472
473 LOG.debug("Validating table.");
474 Table table = new HTable(conf, tableName);
475 boolean verified = false;
476 long pause = conf.getLong("hbase.client.pause", 5 * 1000);
477 int numRetries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 5);
478 for (int i = 0; i < numRetries; i++) {
479 try {
480 Scan scan = new Scan();
481
482 scan.addFamily(Bytes.toBytes(family));
483 ResultScanner resScanner = table.getScanner(scan);
484 int numRows = 0;
485 for (Result res : resScanner) {
486 numRows++;
487 assertEquals(2, res.size());
488 List<Cell> kvs = res.listCells();
489 assertTrue(CellUtil.matchingRow(kvs.get(0), Bytes.toBytes("KEY")));
490 assertTrue(CellUtil.matchingRow(kvs.get(1), Bytes.toBytes("KEY")));
491 assertTrue(CellUtil.matchingValue(kvs.get(0), Bytes.toBytes("VALUE" + valueMultiplier)));
492 assertTrue(CellUtil.matchingValue(kvs.get(1), Bytes.toBytes("VALUE" + 2 * valueMultiplier)));
493
494 }
495 if (isDryRun) {
496 assertEquals(0, numRows);
497 } else {
498 assertEquals(1, numRows);
499 }
500 verified = true;
501 break;
502 } catch (NullPointerException e) {
503
504
505 }
506 try {
507 Thread.sleep(pause);
508 } catch (InterruptedException e) {
509
510 }
511 }
512 table.close();
513 assertTrue(verified);
514 }
515
516
517
518
519 private static void validateHFiles(FileSystem fs, String outputPath, String family,
520 int expectedKVCount) throws IOException {
521
522 LOG.debug("Validating HFiles.");
523 Set<String> configFamilies = new HashSet<String>();
524 configFamilies.add(family);
525 Set<String> foundFamilies = new HashSet<String>();
526 int actualKVCount = 0;
527 for (FileStatus cfStatus : fs.listStatus(new Path(outputPath), new OutputFilesFilter())) {
528 String[] elements = cfStatus.getPath().toString().split(Path.SEPARATOR);
529 String cf = elements[elements.length - 1];
530 foundFamilies.add(cf);
531 assertTrue(
532 String.format(
533 "HFile output contains a column family (%s) not present in input families (%s)",
534 cf, configFamilies),
535 configFamilies.contains(cf));
536 for (FileStatus hfile : fs.listStatus(cfStatus.getPath())) {
537 assertTrue(
538 String.format("HFile %s appears to contain no data.", hfile.getPath()),
539 hfile.getLen() > 0);
540
541 if (expectedKVCount > -1) {
542 actualKVCount += getKVCountFromHfile(fs, hfile.getPath());
543 }
544 }
545 }
546 assertTrue(String.format("HFile output does not contain the input family '%s'.", family),
547 foundFamilies.contains(family));
548 if (expectedKVCount > -1) {
549 assertTrue(String.format(
550 "KV count in ouput hfile=<%d> doesn't match with expected KV count=<%d>", actualKVCount,
551 expectedKVCount), actualKVCount == expectedKVCount);
552 }
553 }
554
555
556
557
558
559
560
561
562 private static int getKVCountFromHfile(FileSystem fs, Path p) throws IOException {
563 Configuration conf = util.getConfiguration();
564 HFile.Reader reader = HFile.createReader(fs, p, new CacheConfig(conf), conf);
565 reader.loadFileInfo();
566 HFileScanner scanner = reader.getScanner(false, false);
567 scanner.seekTo();
568 int count = 0;
569 do {
570 count++;
571 } while (scanner.next());
572 reader.close();
573 return count;
574 }
575 }
576