View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.mapreduce;
20  
21  import static java.lang.String.format;
22  
23  import com.google.common.base.Preconditions;
24  import com.google.common.base.Splitter;
25  import com.google.common.collect.Lists;
26  
27  import org.apache.commons.lang.StringUtils;
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.conf.Configuration;
31  import org.apache.hadoop.conf.Configured;
32  import org.apache.hadoop.fs.Path;
33  import org.apache.hadoop.hbase.HBaseConfiguration;
34  import org.apache.hadoop.hbase.HColumnDescriptor;
35  import org.apache.hadoop.hbase.HConstants;
36  import org.apache.hadoop.hbase.HTableDescriptor;
37  import org.apache.hadoop.hbase.TableName;
38  import org.apache.hadoop.hbase.TableNotEnabledException;
39  import org.apache.hadoop.hbase.TableNotFoundException;
40  import org.apache.hadoop.hbase.classification.InterfaceAudience;
41  import org.apache.hadoop.hbase.classification.InterfaceStability;
42  import org.apache.hadoop.hbase.client.Admin;
43  import org.apache.hadoop.hbase.client.Connection;
44  import org.apache.hadoop.hbase.client.ConnectionFactory;
45  import org.apache.hadoop.hbase.client.Put;
46  import org.apache.hadoop.hbase.client.RegionLocator;
47  import org.apache.hadoop.hbase.client.Table;
48  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
49  import org.apache.hadoop.hbase.util.Base64;
50  import org.apache.hadoop.hbase.util.Bytes;
51  import org.apache.hadoop.hbase.util.Pair;
52  import org.apache.hadoop.io.Text;
53  import org.apache.hadoop.mapreduce.Job;
54  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
55  import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
56  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
57  import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
58  import org.apache.hadoop.security.Credentials;
59  import org.apache.hadoop.util.GenericOptionsParser;
60  import org.apache.hadoop.util.Tool;
61  import org.apache.hadoop.util.ToolRunner;
62  
63  import java.io.File;
64  import java.io.IOException;
65  import java.util.ArrayList;
66  import java.util.HashSet;
67  import java.util.Set;
68  
69  /**
70   * Tool to import data from a TSV file.
71   *
72   * This tool is rather simplistic - it doesn't do any quoting or
73   * escaping, but is useful for many data loads.
74   *
75   * @see ImportTsv#usage(String)
76   */
77  @InterfaceAudience.Public
78  @InterfaceStability.Stable
79  public class ImportTsv extends Configured implements Tool {
80  
  protected static final Log LOG = LogFactory.getLog(ImportTsv.class);

  // Tool name; used in the usage text and as the prefix of the default job name.
  final static String NAME = "importtsv";

  // Class name of a user-supplied mapper; when unset, DEFAULT_MAPPER is used.
  public final static String MAPPER_CONF_KEY = "importtsv.mapper.class";
  // Output path for HFiles; when set, the job prepares a bulk load instead of
  // writing straight to the table.
  public final static String BULK_OUTPUT_CONF_KEY = "importtsv.bulk.output";
  // Timestamp to use for the import; run() defaults it to the current time.
  public final static String TIMESTAMP_CONF_KEY = "importtsv.timestamp";
  // Overrides the MapReduce job name (default: NAME + "_" + table name).
  public final static String JOB_NAME_CONF_KEY = "mapreduce.job.name";
  // TODO: the rest of these configs are used exclusively by TsvImporterMapper.
  // Move them out of the tool and let the mapper handle its own validation.
  // Dry run: no data is written; a table created for the run is deleted afterwards.
  public final static String DRY_RUN_CONF_KEY = "importtsv.dry.run";
  // If true, bad lines are logged to stderr. Default: false.
  public final static String LOG_BAD_LINES_CONF_KEY = "importtsv.log.bad.lines";
  // Per the usage text: set to false to fail when an invalid line is encountered.
  public final static String SKIP_LINES_CONF_KEY = "importtsv.skip.bad.lines";
  // Per the usage text: if true, empty columns are skipped in bulk import.
  public final static String SKIP_EMPTY_COLUMNS = "importtsv.skip.empty.columns";
  // Comma-separated column specification (family, family:qualifier or special tokens).
  public final static String COLUMNS_CONF_KEY = "importtsv.columns";
  // Field separator; createSubmittableJob re-encodes the value as Base64 in the conf.
  public final static String SEPARATOR_CONF_KEY = "importtsv.separator";
  // Not read in this file. NOTE(review): presumably consumed by the mapper - confirm.
  public final static String ATTRIBUTE_SEPERATOR_CONF_KEY = "attributes.seperator";
  //This config is used to propagate credentials from parent MR jobs which launch
  //ImportTSV jobs. SEE IntegrationTestImportTsv.
  public final static String CREDENTIALS_LOCATION = "credentials_location";
  final static String DEFAULT_SEPARATOR = "\t";
  final static String DEFAULT_ATTRIBUTES_SEPERATOR = "=>";
  // Separator between individual attributes inside the attributes column.
  final static String DEFAULT_MULTIPLE_ATTRIBUTES_SEPERATOR = ",";
  final static Class DEFAULT_MAPPER = TsvImporterMapper.class;
  // "yes" (default) lets the bulk-output path create a missing table; any other
  // value makes a missing table an error.
  public final static String CREATE_TABLE_CONF_KEY = "create.table";
  // If true, the column-family check against the table descriptor is skipped.
  public final static String NO_STRICT_COL_FAMILY = "no.strict";
  /**
   * If table didn't exist and was created in dry-run mode, this flag is
   * flipped to delete it when MR ends.
   * Reads and writes in this file are done while holding the ImportTsv.class monitor.
   */
  private static boolean DRY_RUN_TABLE_CREATED;
113 
114   public static class TsvParser {
    /**
     * Column families and qualifiers mapped to the TSV columns.
     * Both arrays are indexed by column position; entries for the special
     * columns (row key, timestamp, attributes, visibility, TTL) are left null.
     */
    private final byte[][] families;
    private final byte[][] qualifiers;

    // The single byte that separates fields on a line.
    private final byte separatorByte;

    // Position of the HBASE_ROW_KEY column; stays 0 if the spec never names it.
    private int rowKeyColumnIndex;

    // Number of entries in the column specification.
    private int maxColumnCount;

    // Default value must be negative
    public static final int DEFAULT_TIMESTAMP_COLUMN_INDEX = -1;

    // Position of the HBASE_TS_KEY column, or the default when absent.
    private int timestampKeyColumnIndex = DEFAULT_TIMESTAMP_COLUMN_INDEX;

    // Special column-spec tokens recognised by the constructor.
    public static final String ROWKEY_COLUMN_SPEC = "HBASE_ROW_KEY";

    public static final String TIMESTAMPKEY_COLUMN_SPEC = "HBASE_TS_KEY";

    public static final String ATTRIBUTES_COLUMN_SPEC = "HBASE_ATTRIBUTES_KEY";

    public static final String CELL_VISIBILITY_COLUMN_SPEC = "HBASE_CELL_VISIBILITY";

    public static final String CELL_TTL_COLUMN_SPEC = "HBASE_CELL_TTL";

    // Position of the HBASE_ATTRIBUTES_KEY column, or the default when absent.
    private int attrKeyColumnIndex = DEFAULT_ATTRIBUTES_COLUMN_INDEX;

    // Sentinel indexes meaning "this special column is not part of the spec".
    public static final int DEFAULT_ATTRIBUTES_COLUMN_INDEX = -1;

    public static final int DEFAULT_CELL_VISIBILITY_COLUMN_INDEX = -1;

    public static final int DEFAULT_CELL_TTL_COLUMN_INDEX = -1;

    // Position of the HBASE_CELL_VISIBILITY column, or the default when absent.
    private int cellVisibilityColumnIndex = DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;

    // Position of the HBASE_CELL_TTL column, or the default when absent.
    private int cellTTLColumnIndex = DEFAULT_CELL_TTL_COLUMN_INDEX;
153 
154     /**
155      * @param columnsSpecification the list of columns to parser out, comma separated.
156      * The row key should be the special token TsvParser.ROWKEY_COLUMN_SPEC
157      * @param separatorStr
158      */
159     public TsvParser(String columnsSpecification, String separatorStr) {
160       // Configure separator
161       byte[] separator = Bytes.toBytes(separatorStr);
162       Preconditions.checkArgument(separator.length == 1,
163         "TsvParser only supports single-byte separators");
164       separatorByte = separator[0];
165 
166       // Configure columns
167       ArrayList<String> columnStrings = Lists.newArrayList(
168         Splitter.on(',').trimResults().split(columnsSpecification));
169 
170       maxColumnCount = columnStrings.size();
171       families = new byte[maxColumnCount][];
172       qualifiers = new byte[maxColumnCount][];
173 
174       for (int i = 0; i < columnStrings.size(); i++) {
175         String str = columnStrings.get(i);
176         if (ROWKEY_COLUMN_SPEC.equals(str)) {
177           rowKeyColumnIndex = i;
178           continue;
179         }
180         if (TIMESTAMPKEY_COLUMN_SPEC.equals(str)) {
181           timestampKeyColumnIndex = i;
182           continue;
183         }
184         if (ATTRIBUTES_COLUMN_SPEC.equals(str)) {
185           attrKeyColumnIndex = i;
186           continue;
187         }
188         if (CELL_VISIBILITY_COLUMN_SPEC.equals(str)) {
189           cellVisibilityColumnIndex = i;
190           continue;
191         }
192         if (CELL_TTL_COLUMN_SPEC.equals(str)) {
193           cellTTLColumnIndex = i;
194           continue;
195         }
196         String[] parts = str.split(":", 2);
197         if (parts.length == 1) {
198           families[i] = str.getBytes();
199           qualifiers[i] = HConstants.EMPTY_BYTE_ARRAY;
200         } else {
201           families[i] = parts[0].getBytes();
202           qualifiers[i] = parts[1].getBytes();
203         }
204       }
205     }
206 
    /** @return true if the column specification named a timestamp column. */
    public boolean hasTimestamp() {
      return timestampKeyColumnIndex != DEFAULT_TIMESTAMP_COLUMN_INDEX;
    }

    /** @return position of the timestamp column, or the negative default when absent. */
    public int getTimestampKeyColumnIndex() {
      return timestampKeyColumnIndex;
    }

    /** @return true if the column specification named an attributes column. */
    public boolean hasAttributes() {
      return attrKeyColumnIndex != DEFAULT_ATTRIBUTES_COLUMN_INDEX;
    }

    /** @return true if the column specification named a cell-visibility column. */
    public boolean hasCellVisibility() {
      return cellVisibilityColumnIndex != DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;
    }
222 
223     public boolean hasCellTTL() {
224       return cellTTLColumnIndex != DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;
225     }
226 
    /** @return position of the attributes column, or the negative default when absent. */
    public int getAttributesKeyColumnIndex() {
      return attrKeyColumnIndex;
    }

    /** @return position of the cell-visibility column, or the negative default when absent. */
    public int getCellVisibilityColumnIndex() {
      return cellVisibilityColumnIndex;
    }

    /** @return position of the cell-TTL column, or the negative default when absent. */
    public int getCellTTLColumnIndex() {
      return cellTTLColumnIndex;
    }

    /** @return position of the row-key column (0 if the specification never named one). */
    public int getRowKeyColumnIndex() {
      return rowKeyColumnIndex;
    }

    /**
     * @param idx column position in the specification
     * @return the family bytes for that column; null for the special columns
     */
    public byte[] getFamily(int idx) {
      return families[idx];
    }
    /**
     * @param idx column position in the specification
     * @return the qualifier bytes for that column (empty when only a family was
     * given); null for the special columns
     */
    public byte[] getQualifier(int idx) {
      return qualifiers[idx];
    }
249 
250     public ParsedLine parse(byte[] lineBytes, int length)
251     throws BadTsvLineException {
252       // Enumerate separator offsets
253       ArrayList<Integer> tabOffsets = new ArrayList<Integer>(maxColumnCount);
254       for (int i = 0; i < length; i++) {
255         if (lineBytes[i] == separatorByte) {
256           tabOffsets.add(i);
257         }
258       }
259       if (tabOffsets.isEmpty()) {
260         throw new BadTsvLineException("No delimiter");
261       }
262 
263       tabOffsets.add(length);
264 
265       if (tabOffsets.size() > maxColumnCount) {
266         throw new BadTsvLineException("Excessive columns");
267       } else if (tabOffsets.size() <= getRowKeyColumnIndex()) {
268         throw new BadTsvLineException("No row key");
269       } else if (hasTimestamp()
270           && tabOffsets.size() <= getTimestampKeyColumnIndex()) {
271         throw new BadTsvLineException("No timestamp");
272       } else if (hasAttributes() && tabOffsets.size() <= getAttributesKeyColumnIndex()) {
273         throw new BadTsvLineException("No attributes specified");
274       } else if (hasCellVisibility() && tabOffsets.size() <= getCellVisibilityColumnIndex()) {
275         throw new BadTsvLineException("No cell visibility specified");
276       } else if (hasCellTTL() && tabOffsets.size() <= getCellTTLColumnIndex()) {
277         throw new BadTsvLineException("No cell TTL specified");
278       }
279       return new ParsedLine(tabOffsets, lineBytes);
280     }
281 
282     class ParsedLine {
283       private final ArrayList<Integer> tabOffsets;
284       private byte[] lineBytes;
285 
286       ParsedLine(ArrayList<Integer> tabOffsets, byte[] lineBytes) {
287         this.tabOffsets = tabOffsets;
288         this.lineBytes = lineBytes;
289       }
290 
291       public int getRowKeyOffset() {
292         return getColumnOffset(rowKeyColumnIndex);
293       }
294       public int getRowKeyLength() {
295         return getColumnLength(rowKeyColumnIndex);
296       }
297 
298       public long getTimestamp(long ts) throws BadTsvLineException {
299         // Return ts if HBASE_TS_KEY is not configured in column spec
300         if (!hasTimestamp()) {
301           return ts;
302         }
303 
304         String timeStampStr = Bytes.toString(lineBytes,
305             getColumnOffset(timestampKeyColumnIndex),
306             getColumnLength(timestampKeyColumnIndex));
307         try {
308           return Long.parseLong(timeStampStr);
309         } catch (NumberFormatException nfe) {
310           // treat this record as bad record
311           throw new BadTsvLineException("Invalid timestamp " + timeStampStr);
312         }
313       }
314 
315       private String getAttributes() {
316         if (!hasAttributes()) {
317           return null;
318         } else {
319           return Bytes.toString(lineBytes, getColumnOffset(attrKeyColumnIndex),
320               getColumnLength(attrKeyColumnIndex));
321         }
322       }
323 
324       public String[] getIndividualAttributes() {
325         String attributes = getAttributes();
326         if (attributes != null) {
327           return attributes.split(DEFAULT_MULTIPLE_ATTRIBUTES_SEPERATOR);
328         } else {
329           return null;
330         }
331       }
332 
333       public int getAttributeKeyOffset() {
334         if (hasAttributes()) {
335           return getColumnOffset(attrKeyColumnIndex);
336         } else {
337           return DEFAULT_ATTRIBUTES_COLUMN_INDEX;
338         }
339       }
340 
341       public int getAttributeKeyLength() {
342         if (hasAttributes()) {
343           return getColumnLength(attrKeyColumnIndex);
344         } else {
345           return DEFAULT_ATTRIBUTES_COLUMN_INDEX;
346         }
347       }
348 
349       public int getCellVisibilityColumnOffset() {
350         if (hasCellVisibility()) {
351           return getColumnOffset(cellVisibilityColumnIndex);
352         } else {
353           return DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;
354         }
355       }
356 
357       public int getCellVisibilityColumnLength() {
358         if (hasCellVisibility()) {
359           return getColumnLength(cellVisibilityColumnIndex);
360         } else {
361           return DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;
362         }
363       }
364 
365       public String getCellVisibility() {
366         if (!hasCellVisibility()) {
367           return null;
368         } else {
369           return Bytes.toString(lineBytes, getColumnOffset(cellVisibilityColumnIndex),
370               getColumnLength(cellVisibilityColumnIndex));
371         }
372       }
373 
374       public int getCellTTLColumnOffset() {
375         if (hasCellTTL()) {
376           return getColumnOffset(cellTTLColumnIndex);
377         } else {
378           return DEFAULT_CELL_TTL_COLUMN_INDEX;
379         }
380       }
381 
382       public int getCellTTLColumnLength() {
383         if (hasCellTTL()) {
384           return getColumnLength(cellTTLColumnIndex);
385         } else {
386           return DEFAULT_CELL_TTL_COLUMN_INDEX;
387         }
388       }
389 
390       public long getCellTTL() {
391         if (!hasCellTTL()) {
392           return 0;
393         } else {
394           return Bytes.toLong(lineBytes, getColumnOffset(cellTTLColumnIndex),
395               getColumnLength(cellTTLColumnIndex));
396         }
397       }
398 
399       public int getColumnOffset(int idx) {
400         if (idx > 0)
401           return tabOffsets.get(idx - 1) + 1;
402         else
403           return 0;
404       }
405       public int getColumnLength(int idx) {
406         return tabOffsets.get(idx) - getColumnOffset(idx);
407       }
408       public int getColumnCount() {
409         return tabOffsets.size();
410       }
411       public byte[] getLineBytes() {
412         return lineBytes;
413       }
414     }
415 
416     public static class BadTsvLineException extends Exception {
417       public BadTsvLineException(String err) {
418         super(err);
419       }
420       private static final long serialVersionUID = 1L;
421     }
422 
    /**
     * Return starting position and length of row key from the specified line bytes.
     * Scans the line once, counting separator-delimited columns until the row-key
     * column is reached; the end of the line terminates the last column.
     * @param lineBytes buffer holding the line
     * @param length number of valid bytes in {@code lineBytes}
     * @return Pair of row key offset and length.
     * @throws BadTsvLineException if the row-key column is empty, or the line ends
     * before the row-key column is reached
     */
    public Pair<Integer, Integer> parseRowKey(byte[] lineBytes, int length)
        throws BadTsvLineException {
      // Index of the column currently being scanned.
      int rkColumnIndex = 0;
      // Inclusive bounds of the current column.
      int startPos = 0, endPos = 0;
      // i runs up to and including length so that end-of-line closes the last column.
      for (int i = 0; i <= length; i++) {
        if (i == length || lineBytes[i] == separatorByte) {
          endPos = i - 1;
          if (rkColumnIndex++ == getRowKeyColumnIndex()) {
            // endPos + 1 == startPos means the column holds zero bytes.
            if ((endPos + 1) == startPos) {
              throw new BadTsvLineException("Empty value for ROW KEY.");
            }
            break;
          } else {
            // Skip over the separator to the first byte of the next column.
            startPos = endPos + 2;
          }
        }
        // Only reached at end-of-line when the column just closed was not the row key.
        if (i == length) {
          throw new BadTsvLineException(
              "Row key does not exist as number of columns in the line"
                  + " are less than row key position.");
        }
      }
      return new Pair<Integer, Integer>(startPos, endPos - startPos + 1);
    }
454   }
455 
  /**
   * Sets up the actual job.
   *
   * <p>Two modes are configured depending on whether {@link #BULK_OUTPUT_CONF_KEY}
   * is set. With bulk output, a missing table is created (unless
   * {@link #CREATE_TABLE_CONF_KEY} is not "yes"), the requested column families are
   * checked against the table (unless {@link #NO_STRICT_COL_FAMILY} is true) and the
   * job is wired for HFile generation. Without bulk output, the table must already
   * exist and the job writes straight to it with no reducers. In dry-run mode no
   * real output format is configured.
   *
   * <p>Side effects visible in this method: the separator in {@code conf} is
   * replaced by its Base64 encoding, and {@code System.exit(-1)} is called on a
   * column-family mismatch or when TsvImporterTextMapper is used without bulk output.
   *
   * @param conf  The current configuration.
   * @param args  The command line parameters: {@code args[0]} is the table name,
   *              {@code args[1]} the input directory.
   * @return The newly created job.
   * @throws IOException When setting up the job fails.
   * @throws ClassNotFoundException When the configured mapper class cannot be loaded.
   */
  protected static Job createSubmittableJob(Configuration conf, String[] args)
      throws IOException, ClassNotFoundException {
    Job job = null;
    boolean isDryRun = conf.getBoolean(DRY_RUN_CONF_KEY, false);
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
      try (Admin admin = connection.getAdmin()) {
        // Support non-XML supported characters
        // by re-encoding the passed separator as a Base64 string.
        // NOTE(review): the encoded value overwrites the original in conf, so calling
        // this method twice with the same conf would encode twice - confirm callers.
        String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
        if (actualSeparator != null) {
          conf.set(SEPARATOR_CONF_KEY,
              Base64.encodeBytes(actualSeparator.getBytes()));
        }

        // See if a non-default Mapper was set
        String mapperClassName = conf.get(MAPPER_CONF_KEY);
        Class mapperClass =
            mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER;

        TableName tableName = TableName.valueOf(args[0]);
        Path inputDir = new Path(args[1]);
        String jobName = conf.get(JOB_NAME_CONF_KEY,NAME + "_" + tableName.getNameAsString());
        job = Job.getInstance(conf, jobName);
        job.setJarByClass(mapperClass);
        FileInputFormat.setInputPaths(job, inputDir);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
        String columns[] = conf.getStrings(COLUMNS_CONF_KEY);
        // Add credentials from a token file when a location was configured.
        if(StringUtils.isNotEmpty(conf.get(CREDENTIALS_LOCATION))) {
          String fileLoc = conf.get(CREDENTIALS_LOCATION);
          Credentials cred = Credentials.readTokenStorageFile(new File(fileLoc), conf);
          job.getCredentials().addAll(cred);
        }

        if (hfileOutPath != null) {
          // Bulk-output mode.
          if (!admin.tableExists(tableName)) {
            LOG.warn(format("Table '%s' does not exist.", tableName));
            if ("yes".equalsIgnoreCase(conf.get(CREATE_TABLE_CONF_KEY, "yes"))) {
              // TODO: this is backwards. Instead of depending on the existence of a table,
              // create a sane splits file for HFileOutputFormat based on data sampling.
              createTable(admin, tableName, columns);
              if (isDryRun) {
                // Remember that the table is ours so that run() deletes it afterwards.
                LOG.warn("Dry run: Table will be deleted at end of dry run.");
                synchronized (ImportTsv.class) {
                  DRY_RUN_TABLE_CREATED = true;
                }
              }
            } else {
              String errorMsg =
                  format("Table '%s' does not exist and '%s' is set to no.", tableName,
                      CREATE_TABLE_CONF_KEY);
              LOG.error(errorMsg);
              throw new TableNotFoundException(errorMsg);
            }
          }
          try (Table table = connection.getTable(tableName);
              RegionLocator regionLocator = connection.getRegionLocator(tableName)) {
            boolean noStrict = conf.getBoolean(NO_STRICT_COL_FAMILY, false);
            // if no.strict is false then check column family
            if(!noStrict) {
              ArrayList<String> unmatchedFamilies = new ArrayList<String>();
              Set<String> cfSet = getColumnFamilies(columns);
              HTableDescriptor tDesc = table.getTableDescriptor();
              for (String cf : cfSet) {
                if(tDesc.getFamily(Bytes.toBytes(cf)) == null) {
                  unmatchedFamilies.add(cf);
                }
              }
              if(unmatchedFamilies.size() > 0) {
                // List the table's actual families in the error message.
                ArrayList<String> familyNames = new ArrayList<String>();
                for (HColumnDescriptor family : table.getTableDescriptor().getFamilies()) {
                  familyNames.add(family.getNameAsString());
                }
                String msg =
                    "Column Families " + unmatchedFamilies + " specified in " + COLUMNS_CONF_KEY
                    + " does not match with any of the table " + tableName
                    + " column families " + familyNames + ".\n"
                    + "To disable column family check, use -D" + NO_STRICT_COL_FAMILY
                    + "=true.\n";
                usage(msg);
                System.exit(-1);
              }
            }
            // The reducer (and combiner) depend on the mapper's output value type.
            if (mapperClass.equals(TsvImporterTextMapper.class)) {
              job.setMapOutputValueClass(Text.class);
              job.setReducerClass(TextSortReducer.class);
            } else {
              job.setMapOutputValueClass(Put.class);
              job.setCombinerClass(PutCombiner.class);
              job.setReducerClass(PutSortReducer.class);
            }
            if (!isDryRun) {
              Path outputDir = new Path(hfileOutPath);
              FileOutputFormat.setOutputPath(job, outputDir);
              HFileOutputFormat2.configureIncrementalLoad(job, table.getTableDescriptor(),
                  regionLocator);
            }
          }
        } else {
          // Direct-write mode: the target table must already exist.
          if (!admin.tableExists(tableName)) {
            String errorMsg = format("Table '%s' does not exist.", tableName);
            LOG.error(errorMsg);
            throw new TableNotFoundException(errorMsg);
          }
          if (mapperClass.equals(TsvImporterTextMapper.class)) {
            usage(TsvImporterTextMapper.class.toString()
                + " should not be used for non bulkloading case. use "
                + TsvImporterMapper.class.toString()
                + " or custom mapper whose value type is Put.");
            System.exit(-1);
          }
          if (!isDryRun) {
            // No reducers. Just write straight to table. Call initTableReducerJob
            // to set up the TableOutputFormat.
            TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
          }
          job.setNumReduceTasks(0);
        }
        if (isDryRun) {
          // Discard all output and append the HBase serializations to the
          // existing "io.serializations" value.
          job.setOutputFormatClass(NullOutputFormat.class);
          job.getConfiguration().setStrings("io.serializations",
              job.getConfiguration().get("io.serializations"),
              MutationSerialization.class.getName(), ResultSerialization.class.getName(),
              KeyValueSerialization.class.getName());
        }
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(),
            com.google.common.base.Function.class /* Guava used by TsvParser */);
      }
    }
    return job;
  }
598 
599   private static void createTable(Admin admin, TableName tableName, String[] columns)
600       throws IOException {
601     HTableDescriptor htd = new HTableDescriptor(tableName);
602     Set<String> cfSet = getColumnFamilies(columns);
603     for (String cf : cfSet) {
604       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toBytes(cf));
605       htd.addFamily(hcd);
606     }
607     LOG.warn(format("Creating table '%s' with '%s' columns and default descriptors.",
608       tableName, cfSet));
609     admin.createTable(htd);
610   }
611 
612   private static void deleteTable(Configuration conf, String[] args) {
613     TableName tableName = TableName.valueOf(args[0]);
614     try (Connection connection = ConnectionFactory.createConnection(conf);
615          Admin admin = connection.getAdmin()) {
616       try {
617         admin.disableTable(tableName);
618       } catch (TableNotEnabledException e) {
619         LOG.debug("Dry mode: Table: " + tableName + " already disabled, so just deleting it.");
620       }
621       admin.deleteTable(tableName);
622     } catch (IOException e) {
623       LOG.error(format("***Dry run: Failed to delete table '%s'.***%n%s", tableName,
624           e.toString()));
625       return;
626     }
627     LOG.info(format("Dry run: Deleted table '%s'.", tableName));
628   }
629 
630   private static Set<String> getColumnFamilies(String[] columns) {
631     Set<String> cfSet = new HashSet<String>();
632     for (String aColumn : columns) {
633       if (TsvParser.ROWKEY_COLUMN_SPEC.equals(aColumn)
634           || TsvParser.TIMESTAMPKEY_COLUMN_SPEC.equals(aColumn)
635           || TsvParser.CELL_VISIBILITY_COLUMN_SPEC.equals(aColumn)
636           || TsvParser.CELL_TTL_COLUMN_SPEC.equals(aColumn)
637           || TsvParser.ATTRIBUTES_COLUMN_SPEC.equals(aColumn))
638         continue;
639       // we are only concerned with the first one (in case this is a cf:cq)
640       cfSet.add(aColumn.split(":", 2)[0]);
641     }
642     return cfSet;
643   }
644 
645   /*
646    * @param errorMsg Error message.  Can be null.
647    */
648   private static void usage(final String errorMsg) {
649     if (errorMsg != null && errorMsg.length() > 0) {
650       System.err.println("ERROR: " + errorMsg);
651     }
652     String usage =
653       "Usage: " + NAME + " -D"+ COLUMNS_CONF_KEY + "=a,b,c <tablename> <inputdir>\n" +
654       "\n" +
655       "Imports the given input directory of TSV data into the specified table.\n" +
656       "\n" +
657       "The column names of the TSV data must be specified using the -D" + COLUMNS_CONF_KEY + "\n" +
658       "option. This option takes the form of comma-separated column names, where each\n" +
659       "column name is either a simple column family, or a columnfamily:qualifier. The special\n" +
660       "column name " + TsvParser.ROWKEY_COLUMN_SPEC + " is used to designate that this column should be used\n" +
661       "as the row key for each imported record. You must specify exactly one column\n" +
662       "to be the row key, and you must specify a column name for every column that exists in the\n" +
663       "input data. Another special column" + TsvParser.TIMESTAMPKEY_COLUMN_SPEC +
664       " designates that this column should be\n" +
665       "used as timestamp for each record. Unlike " + TsvParser.ROWKEY_COLUMN_SPEC + ", " +
666       TsvParser.TIMESTAMPKEY_COLUMN_SPEC + " is optional." + "\n" +
667       "You must specify at most one column as timestamp key for each imported record.\n" +
668       "Record with invalid timestamps (blank, non-numeric) will be treated as bad record.\n" +
669       "Note: if you use this option, then '" + TIMESTAMP_CONF_KEY + "' option will be ignored.\n" +
670       "\n" +
671       "Other special columns that can be specified are " + TsvParser.CELL_TTL_COLUMN_SPEC +
672       " and " + TsvParser.CELL_VISIBILITY_COLUMN_SPEC + ".\n" +
673       TsvParser.CELL_TTL_COLUMN_SPEC + " designates that this column will be used " +
674       "as a Cell's Time To Live (TTL) attribute.\n" +
675       TsvParser.CELL_VISIBILITY_COLUMN_SPEC + " designates that this column contains the " +
676       "visibility label expression.\n" +
677       "\n" +
678       TsvParser.ATTRIBUTES_COLUMN_SPEC+" can be used to specify Operation Attributes per record.\n"+
679       " Should be specified as key=>value where "+TsvParser.DEFAULT_ATTRIBUTES_COLUMN_INDEX+ " is used \n"+
680       " as the seperator.  Note that more than one OperationAttributes can be specified.\n"+
681       "By default importtsv will load data directly into HBase. To instead generate\n" +
682       "HFiles of data to prepare for a bulk data load, pass the option:\n" +
683       "  -D" + BULK_OUTPUT_CONF_KEY + "=/path/for/output\n" +
684       "  Note: if you do not use this option, then the target table must already exist in HBase\n" +
685       "\n" +
686       "Other options that may be specified with -D include:\n" +
687       "  -D" + DRY_RUN_CONF_KEY + "=true - Dry run mode. Data is not actually populated into" +
688       " table. If table does not exist, it is created but deleted in the end.\n" +
689       "  -D" + SKIP_LINES_CONF_KEY + "=false - fail if encountering an invalid line\n" +
690       "  -D" + LOG_BAD_LINES_CONF_KEY + "=true - logs invalid lines to stderr\n" +
691       "  -D" + SKIP_EMPTY_COLUMNS + "=false - If true then skip empty columns in bulk import\n" +
692       "  '-D" + SEPARATOR_CONF_KEY + "=|' - eg separate on pipes instead of tabs\n" +
693       "  -D" + TIMESTAMP_CONF_KEY + "=currentTimeAsLong - use the specified timestamp for the import\n" +
694       "  -D" + MAPPER_CONF_KEY + "=my.Mapper - A user-defined Mapper to use instead of " +
695       DEFAULT_MAPPER.getName() + "\n" +
696       "  -D" + JOB_NAME_CONF_KEY + "=jobName - use the specified mapreduce job name for the import\n" +
697       "  -D" + CREATE_TABLE_CONF_KEY + "=no - can be used to avoid creation of table by this tool\n" +
698       "  Note: if you set this to 'no', then the target table must already exist in HBase\n" +
699       "  -D" + NO_STRICT_COL_FAMILY + "=true - ignore column family check in hbase table. " +
700       "Default is false\n\n" +
701       "For performance consider the following options:\n" +
702       "  -Dmapreduce.map.speculative=false\n" +
703       "  -Dmapreduce.reduce.speculative=false";
704 
705     System.err.println(usage);
706   }
707 
708   @Override
709   public int run(String[] args) throws Exception {
710     setConf(HBaseConfiguration.create(getConf()));
711     String[] otherArgs = new GenericOptionsParser(getConf(), args).getRemainingArgs();
712     if (otherArgs.length < 2) {
713       usage("Wrong number of arguments: " + otherArgs.length);
714       return -1;
715     }
716 
717     // When MAPPER_CONF_KEY is null, the user wants to use the provided TsvImporterMapper, so
718     // perform validation on these additional args. When it's not null, user has provided their
719     // own mapper, thus these validation are not relevant.
720     // TODO: validation for TsvImporterMapper, not this tool. Move elsewhere.
721     if (null == getConf().get(MAPPER_CONF_KEY)) {
722       // Make sure columns are specified
723       String columns[] = getConf().getStrings(COLUMNS_CONF_KEY);
724       if (columns == null) {
725         usage("No columns specified. Please specify with -D" +
726             COLUMNS_CONF_KEY+"=...");
727         return -1;
728       }
729 
730       // Make sure they specify exactly one column as the row key
731       int rowkeysFound = 0;
732       for (String col : columns) {
733         if (col.equals(TsvParser.ROWKEY_COLUMN_SPEC)) rowkeysFound++;
734       }
735       if (rowkeysFound != 1) {
736         usage("Must specify exactly one column as " + TsvParser.ROWKEY_COLUMN_SPEC);
737         return -1;
738       }
739 
740       // Make sure we have at most one column as the timestamp key
741       int tskeysFound = 0;
742       for (String col : columns) {
743         if (col.equals(TsvParser.TIMESTAMPKEY_COLUMN_SPEC))
744           tskeysFound++;
745       }
746       if (tskeysFound > 1) {
747         usage("Must specify at most one column as "
748             + TsvParser.TIMESTAMPKEY_COLUMN_SPEC);
749         return -1;
750       }
751 
752       int attrKeysFound = 0;
753       for (String col : columns) {
754         if (col.equals(TsvParser.ATTRIBUTES_COLUMN_SPEC))
755           attrKeysFound++;
756       }
757       if (attrKeysFound > 1) {
758         usage("Must specify at most one column as "
759             + TsvParser.ATTRIBUTES_COLUMN_SPEC);
760         return -1;
761       }
762 
763       // Make sure one or more columns are specified excluding rowkey and
764       // timestamp key
765       if (columns.length - (rowkeysFound + tskeysFound + attrKeysFound) < 1) {
766         usage("One or more columns in addition to the row key and timestamp(optional) are required");
767         return -1;
768       }
769     }
770 
771     // If timestamp option is not specified, use current system time.
772     long timstamp = getConf().getLong(TIMESTAMP_CONF_KEY, System.currentTimeMillis());
773 
774     // Set it back to replace invalid timestamp (non-numeric) with current
775     // system time
776     getConf().setLong(TIMESTAMP_CONF_KEY, timstamp);
777 
778     synchronized (ImportTsv.class) {
779       DRY_RUN_TABLE_CREATED = false;
780     }
781     Job job = createSubmittableJob(getConf(), args);
782     boolean success = job.waitForCompletion(true);
783     boolean delete = false;
784     synchronized (ImportTsv.class) {
785       delete = DRY_RUN_TABLE_CREATED;
786     }
787     if (delete) {
788       deleteTable(getConf(), args);
789     }
790     return success ? 0 : 1;
791   }
792 
793   public static void main(String[] args) throws Exception {
794     int status = ToolRunner.run(new ImportTsv(), args);
795     System.exit(status);
796   }
797 }