View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.mapreduce;
19  
20  import java.io.IOException;
21  import java.io.UnsupportedEncodingException;
22  import java.net.InetSocketAddress;
23  import java.net.URLDecoder;
24  import java.net.URLEncoder;
25  import java.util.ArrayList;
26  import java.util.Collection;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.TreeMap;
30  import java.util.TreeSet;
31  import java.util.UUID;
32  
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.fs.FileSystem;
37  import org.apache.hadoop.fs.Path;
38  import org.apache.hadoop.hbase.Cell;
39  import org.apache.hadoop.hbase.CellUtil;
40  import org.apache.hadoop.hbase.HColumnDescriptor;
41  import org.apache.hadoop.hbase.HConstants;
42  import org.apache.hadoop.hbase.HRegionLocation;
43  import org.apache.hadoop.hbase.HTableDescriptor;
44  import org.apache.hadoop.hbase.KeyValue;
45  import org.apache.hadoop.hbase.KeyValueUtil;
46  import org.apache.hadoop.hbase.TableName;
47  import org.apache.hadoop.hbase.classification.InterfaceAudience;
48  import org.apache.hadoop.hbase.classification.InterfaceStability;
49  import org.apache.hadoop.hbase.client.Connection;
50  import org.apache.hadoop.hbase.client.ConnectionFactory;
51  import org.apache.hadoop.hbase.client.HTable;
52  import org.apache.hadoop.hbase.client.Put;
53  import org.apache.hadoop.hbase.client.RegionLocator;
54  import org.apache.hadoop.hbase.client.Table;
55  import org.apache.hadoop.hbase.fs.HFileSystem;
56  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
57  import org.apache.hadoop.hbase.io.compress.Compression;
58  import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
59  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
60  import org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter;
61  import org.apache.hadoop.hbase.io.hfile.CacheConfig;
62  import org.apache.hadoop.hbase.io.hfile.HFile;
63  import org.apache.hadoop.hbase.io.hfile.HFileContext;
64  import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
65  import org.apache.hadoop.hbase.regionserver.BloomType;
66  import org.apache.hadoop.hbase.regionserver.HStore;
67  import org.apache.hadoop.hbase.regionserver.StoreFile;
68  import org.apache.hadoop.hbase.util.Bytes;
69  import org.apache.hadoop.hbase.util.FSUtils;
70  import org.apache.hadoop.io.NullWritable;
71  import org.apache.hadoop.io.SequenceFile;
72  import org.apache.hadoop.io.Text;
73  import org.apache.hadoop.mapreduce.Job;
74  import org.apache.hadoop.mapreduce.OutputCommitter;
75  import org.apache.hadoop.mapreduce.OutputFormat;
76  import org.apache.hadoop.mapreduce.RecordWriter;
77  import org.apache.hadoop.mapreduce.TaskAttemptContext;
78  import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
79  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
80  import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
81  
82  /**
83   * Writes HFiles. Passed Cells must arrive in order.
84   * Writes current time as the sequence id for the file. Sets the major compacted
85   * attribute on created {@link HFile}s. Calling write(null,null) will forcibly roll
86   * all HFiles being written.
87   * <p>
88   * Using this class as part of a MapReduce job is best done
89   * using {@link #configureIncrementalLoad(Job, Table, RegionLocator)}.
90   */
91  @InterfaceAudience.Public
92  @InterfaceStability.Evolving
93  public class HFileOutputFormat2
94      extends FileOutputFormat<ImmutableBytesWritable, Cell> {
  /** Logger for progress and diagnostic messages throughout this output format. */
  private static final Log LOG = LogFactory.getLog(HFileOutputFormat2.class);

  // The following constants are private since these are used by
  // HFileOutputFormat2 to internally transfer data between job setup and
  // reducer run using conf.
  // These should not be changed by the client.

  /** Conf key holding the serialized column family -> compression algorithm map. */
  private static final String COMPRESSION_FAMILIES_CONF_KEY =
      "hbase.hfileoutputformat.families.compression";
  /** Conf key holding the serialized column family -> bloom filter type map. */
  private static final String BLOOM_TYPE_FAMILIES_CONF_KEY =
      "hbase.hfileoutputformat.families.bloomtype";
  /** Conf key holding the serialized column family -> block size map. */
  private static final String BLOCK_SIZE_FAMILIES_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.blocksize";
  /** Conf key holding the serialized column family -> data block encoding map. */
  private static final String DATABLOCK_ENCODING_FAMILIES_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.families.datablock.encoding";

  // This constant is public since the client can modify this when setting
  // up their conf object and thus refer to this symbol.
  // It is present for backwards compatibility reasons. Use it only to
  // override the auto-detection of datablock encoding and compression.
  public static final String DATABLOCK_ENCODING_OVERRIDE_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.datablock.encoding";
  /** Optional compression override; when set it takes precedence over per-family values. */
  public static final String COMPRESSION_OVERRIDE_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.compression";

  /**
   * Keep locality while generating HFiles for bulkload. See HBASE-12596
   */
  public static final String LOCALITY_SENSITIVE_CONF_KEY =
      "hbase.bulkload.locality.sensitive.enabled";
  /** Locality-sensitive HFile placement is on unless explicitly disabled. */
  private static final boolean DEFAULT_LOCALITY_SENSITIVE = true;
  /** Conf key recording the output table name, used to look up region locations. */
  public static final String OUTPUT_TABLE_NAME_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.table.name";

  // ZooKeeper connection details of the (possibly remote) cluster whose region
  // locations drive locality-sensitive placement; see configureRemoteCluster.
  public static final String REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY =
    "hbase.hfileoutputformat.remote.cluster.zookeeper.quorum";
  public static final String REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY =
    "hbase.hfileoutputformat.remote.cluster.zookeeper." + HConstants.CLIENT_PORT_STR;
  public static final String REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY =
    "hbase.hfileoutputformat.remote.cluster." + HConstants.ZOOKEEPER_ZNODE_PARENT;

  /** Global storage-policy conf key, and the prefix for per-column-family overrides. */
  public static final String STORAGE_POLICY_PROPERTY = HStore.BLOCK_STORAGE_POLICY_KEY;
  public static final String STORAGE_POLICY_PROPERTY_CF_PREFIX = STORAGE_POLICY_PROPERTY + ".";
137 
138   @Override
139   public RecordWriter<ImmutableBytesWritable, Cell> getRecordWriter(
140       final TaskAttemptContext context) throws IOException, InterruptedException {
141     return createRecordWriter(context, this.getOutputCommitter(context));
142   }
143 
  /**
   * Builds the {@link RecordWriter} that turns the sorted stream of (row, cell) pairs into
   * HFiles under the committer's work path, one sub-directory per column family.
   * <p>
   * Per-family compression, bloom type, block size and data block encoding are deserialized
   * from the job configuration (put there at job-setup time by the configure* helpers); the
   * {@link #COMPRESSION_OVERRIDE_CONF_KEY} and {@link #DATABLOCK_ENCODING_OVERRIDE_CONF_KEY}
   * settings, when present, take precedence over the per-family values. When any family's
   * file reaches the configured max size (HConstants.HREGION_MAX_FILESIZE), ALL writers are
   * rolled at the next row boundary so no row is split across files. Writing a
   * {@code (null, null)} pair forces an immediate roll of every open writer.
   *
   * @param context task attempt context supplying the configuration and attempt id
   * @param committer must be a {@link FileOutputCommitter}; its work path becomes the
   *          output root for all HFiles
   * @return a record writer producing HFiles
   * @throws IOException if the output filesystem cannot be obtained
   */
  static <V extends Cell> RecordWriter<ImmutableBytesWritable, V>
      createRecordWriter(final TaskAttemptContext context, final OutputCommitter committer)
          throws IOException {

    // Get the path of the temporary output file
    final Path outputdir = ((FileOutputCommitter) committer).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    // These configs. are from hbase-*.xml
    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE,
        HConstants.DEFAULT_MAX_FILE_SIZE);
    // Invented config.  Add to hbase-*.xml if other than default compression.
    final String defaultCompressionStr = conf.get("hfile.compression",
        Compression.Algorithm.NONE.getName());
    final Algorithm defaultCompression = AbstractHFileWriter
        .compressionByName(defaultCompressionStr);
    // Explicit compression override, if any, beats both the per-family map and the default.
    String compressionStr = conf.get(COMPRESSION_OVERRIDE_CONF_KEY);
    final Algorithm overriddenCompression;
    if (compressionStr != null) {
      overriddenCompression = Compression.getCompressionAlgorithmByName(compressionStr);
    } else {
      overriddenCompression = null;
    }
    final boolean compactionExclude = conf.getBoolean(
        "hbase.mapreduce.hfileoutputformat.compaction.exclude", false);

    // create a map from column family to the compression algorithm
    final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf);
    final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf);
    final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf);

    // Explicit data block encoding override, if any, beats the per-family encoding map.
    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
    final Map<byte[], DataBlockEncoding> datablockEncodingMap
        = createFamilyDataBlockEncodingMap(conf);
    final DataBlockEncoding overriddenEncoding;
    if (dataBlockEncodingStr != null) {
      overriddenEncoding = DataBlockEncoding.valueOf(dataBlockEncodingStr);
    } else {
      overriddenEncoding = null;
    }

    return new RecordWriter<ImmutableBytesWritable, V>() {
      // Map of families to writers and how much has been output on the writer.
      private final Map<byte [], WriterLength> writers =
        new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      // Row key of the previously written cell; rolls are deferred to row boundaries.
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY;
      // Timestamp applied to each cell via updateLatestStamp, fixed at writer-creation time.
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis());
      // Set when any family's file crosses maxsize; acted upon at the next row boundary.
      private boolean rollRequested = false;

      @Override
      public void write(ImmutableBytesWritable row, V cell)
          throws IOException {
        KeyValue kv = KeyValueUtil.ensureKeyValue(cell);

        // null input == user explicitly wants to flush
        if (row == null && kv == null) {
          rollWriters();
          return;
        }

        byte [] rowKey = CellUtil.cloneRow(kv);
        long length = kv.getLength();
        byte [] family = CellUtil.cloneFamily(kv);
        WriterLength wl = this.writers.get(family);

        // If this is a new column family, verify that the directory exists
        if (wl == null) {
          Path cfPath = new Path(outputdir, Bytes.toString(family));
          fs.mkdirs(cfPath);
          configureStoragePolicy(conf, fs, family, cfPath);
        }

        // If any of the HFiles for the column families has reached
        // maxsize, we need to roll all the writers
        if (wl != null && wl.written + length >= maxsize) {
          this.rollRequested = true;
        }

        // This can only happen once a row is finished though
        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
          rollWriters();
        }

        // Create a new HFile writer if necessary: first cell seen for this family, or
        // this family's previous writer was rolled (wl.writer nulled by rollWriters).
        if (wl == null || wl.writer == null) {
          if (conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) {
            HRegionLocation loc = null;
            String tableName = conf.get(OUTPUT_TABLE_NAME_CONF_KEY);
            if (tableName != null) {
              // Look up which region server hosts this row so the HFile's blocks can be
              // placed on that host. NOTE(review): this opens a fresh Connection for every
              // new writer, which may be costly; confirm acceptable for the expected number
              // of families and rolls.
              try (Connection connection = ConnectionFactory.createConnection(
                createRemoteClusterConf(conf));
                     RegionLocator locator =
                       connection.getRegionLocator(TableName.valueOf(tableName))) {
                loc = locator.getRegionLocation(rowKey);
              } catch (Throwable e) {
                // Locality is best-effort: fall back to a default writer on any failure.
                LOG.warn("there's something wrong when locating rowkey: " +
                  Bytes.toString(rowKey), e);
                loc = null;
              }
            }

            if (null == loc) {
              if (LOG.isTraceEnabled()) {
                LOG.trace("failed to get region location, so use default writer: "
                    + Bytes.toString(rowKey));
              }
              wl = getNewWriter(family, conf, null);
            } else {
              if (LOG.isDebugEnabled()) {
                LOG.debug("first rowkey: [" + Bytes.toString(rowKey) + "]");
              }
              InetSocketAddress initialIsa =
                  new InetSocketAddress(loc.getHostname(), loc.getPort());
              if (initialIsa.isUnresolved()) {
                if (LOG.isTraceEnabled()) {
                  LOG.trace("failed to resolve bind address: " + loc.getHostname() + ":"
                      + loc.getPort() + ", so use default writer");
                }
                wl = getNewWriter(family, conf, null);
              } else {
                if (LOG.isDebugEnabled()) {
                  LOG.debug("use favored nodes writer: " + initialIsa.getHostString());
                }
                wl = getNewWriter(family, conf, new InetSocketAddress[] { initialIsa });
              }
            }
          } else {
            wl = getNewWriter(family, conf, null);
          }
        }

        // we now have the proper HFile writer. full steam ahead
        kv.updateLatestStamp(this.now);
        wl.writer.append(kv);
        wl.written += length;

        // Copy the row so we know when a row transitions.
        this.previousRow = rowKey;
      }

      /** Closes and resets every family's writer; the next cell per family reopens one. */
      private void rollWriters() throws IOException {
        for (WriterLength wl : this.writers.values()) {
          if (wl.writer != null) {
            LOG.info("Writer=" + wl.writer.getPath() +
                ((wl.written == 0)? "": ", wrote=" + wl.written));
            close(wl.writer);
          }
          wl.writer = null;
          wl.written = 0;
        }
        this.rollRequested = false;
      }

      /**
       * Returns a copy of {@code conf} redirected at the remote cluster's ZooKeeper when all
       * three REMOTE_CLUSTER_ZOOKEEPER_* keys are present; otherwise an unmodified copy.
       */
      private Configuration createRemoteClusterConf(Configuration conf) {
        final Configuration newConf = new Configuration(conf);

        final String quorum = conf.get(REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY);
        final String clientPort = conf.get(REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY);
        final String parent = conf.get(REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY);

        if (quorum != null && clientPort != null && parent != null) {
          newConf.set(HConstants.ZOOKEEPER_QUORUM, quorum);
          newConf.setInt(HConstants.ZOOKEEPER_CLIENT_PORT, Integer.parseInt(clientPort));
          newConf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, parent);
        }

        return newConf;
      }

      /* Create a new StoreFile.Writer.
       * @param family column family the writer is for
       * @param conf job configuration (per-family settings were deserialized from it above)
       * @param favoredNodes hosts to favor for block placement, or null for default placement
       * @return A WriterLength, containing a new StoreFile.Writer.
       * @throws IOException
       */
      @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="BX_UNBOXING_IMMEDIATELY_REBOXED",
          justification="Not important")
      private WriterLength getNewWriter(byte[] family, Configuration conf,
          InetSocketAddress[] favoredNodes)
          throws IOException {
        WriterLength wl = new WriterLength();
        Path familydir = new Path(outputdir, Bytes.toString(family));
        // Resolution order for each setting: explicit override, per-family value, default.
        Algorithm compression = overriddenCompression;
        compression = compression == null ? compressionMap.get(family) : compression;
        compression = compression == null ? defaultCompression : compression;
        BloomType bloomType = bloomTypeMap.get(family);
        bloomType = bloomType == null ? BloomType.NONE : bloomType;
        Integer blockSize = blockSizeMap.get(family);
        blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
        DataBlockEncoding encoding = overriddenEncoding;
        encoding = encoding == null ? datablockEncodingMap.get(family) : encoding;
        encoding = encoding == null ? DataBlockEncoding.NONE : encoding;
        // Zero out the block cache share on the writer's CacheConfig; the MR task has no
        // use for cached blocks.
        Configuration tempConf = new Configuration(conf);
        tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
        HFileContextBuilder contextBuilder = new HFileContextBuilder()
                                    .withCompression(compression)
                                    .withChecksumType(HStore.getChecksumType(conf))
                                    .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf))
                                    .withBlockSize(blockSize)
                                    .withColumnFamily(family);
        // Tags are only representable in HFile format v3+.
        if (HFile.getFormatVersion(conf) >= HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
          contextBuilder.withIncludesTags(true);
        }

        contextBuilder.withDataBlockEncoding(encoding);
        HFileContext hFileContext = contextBuilder.build();

        if (null == favoredNodes) {
          wl.writer =
              new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs)
                  .withOutputDir(familydir).withBloomType(bloomType)
                  .withComparator(KeyValue.COMPARATOR).withFileContext(hFileContext).build();
        } else {
          // Wrap the filesystem in an HFileSystem so favored nodes can be requested.
          wl.writer =
              new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), new HFileSystem(fs))
                  .withOutputDir(familydir).withBloomType(bloomType)
                  .withComparator(KeyValue.COMPARATOR).withFileContext(hFileContext)
                  .withFavoredNodes(favoredNodes).build();
        }

        this.writers.put(family, wl);
        return wl;
      }

      /** Stamps bulk-load metadata (time, task id, compaction flags) then closes the writer. */
      private void close(final StoreFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
              Bytes.toBytes(compactionExclude));
          w.appendTrackedTimestampsToMetadata();
          w.close();
        }
      }

      @Override
      public void close(TaskAttemptContext c)
      throws IOException, InterruptedException {
        for (WriterLength wl: this.writers.values()) {
          close(wl.writer);
        }
      }
    };
  }
391 
392   /**
393    * Configure block storage policy for CF after the directory is created.
394    */
395   static void configureStoragePolicy(final Configuration conf, final FileSystem fs,
396       byte[] family, Path cfPath) {
397     if (null == conf || null == fs || null == family || null == cfPath) {
398       return;
399     }
400     String policy =
401         conf.get(STORAGE_POLICY_PROPERTY_CF_PREFIX + Bytes.toString(family),
402           conf.get(STORAGE_POLICY_PROPERTY));
403 
404     FSUtils.setStoragePolicy(fs, cfPath, policy);
405   }
406 
  /*
   * Data structure to hold a Writer and amount of data written on it.
   */
  static class WriterLength {
    // Bytes appended to the current writer so far; compared against the configured
    // maximum HFile size to decide when a roll is needed, reset to 0 on roll.
    long written = 0;
    // Open HFile writer for one column family; null until first use or after a roll.
    StoreFile.Writer writer = null;
  }
414 
415   /**
416    * Return the start keys of all of the regions in this table,
417    * as a list of ImmutableBytesWritable.
418    */
419   private static List<ImmutableBytesWritable> getRegionStartKeys(RegionLocator table)
420   throws IOException {
421     byte[][] byteKeys = table.getStartKeys();
422     ArrayList<ImmutableBytesWritable> ret =
423       new ArrayList<ImmutableBytesWritable>(byteKeys.length);
424     for (byte[] byteKey : byteKeys) {
425       ret.add(new ImmutableBytesWritable(byteKey));
426     }
427     return ret;
428   }
429 
430   /**
431    * Write out a {@link SequenceFile} that can be read by
432    * {@link TotalOrderPartitioner} that contains the split points in startKeys.
433    */
434   @SuppressWarnings("deprecation")
435   private static void writePartitions(Configuration conf, Path partitionsPath,
436       List<ImmutableBytesWritable> startKeys) throws IOException {
437     LOG.info("Writing partition information to " + partitionsPath);
438     if (startKeys.isEmpty()) {
439       throw new IllegalArgumentException("No regions passed");
440     }
441 
442     // We're generating a list of split points, and we don't ever
443     // have keys < the first region (which has an empty start key)
444     // so we need to remove it. Otherwise we would end up with an
445     // empty reducer with index 0
446     TreeSet<ImmutableBytesWritable> sorted =
447       new TreeSet<ImmutableBytesWritable>(startKeys);
448 
449     ImmutableBytesWritable first = sorted.first();
450     if (!Bytes.equals(first.get(), HConstants.EMPTY_BYTE_ARRAY)) {
451       throw new IllegalArgumentException(
452           "First region of table should have empty start key. Instead has: "
453           + Bytes.toStringBinary(first.get()));
454     }
455     sorted.remove(first);
456 
457     // Write the actual file
458     FileSystem fs = partitionsPath.getFileSystem(conf);
459     SequenceFile.Writer writer = SequenceFile.createWriter(
460       fs, conf, partitionsPath, ImmutableBytesWritable.class,
461       NullWritable.class);
462 
463     try {
464       for (ImmutableBytesWritable startKey : sorted) {
465         writer.append(startKey, NullWritable.get());
466       }
467     } finally {
468       writer.close();
469     }
470   }
471 
  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat2's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
   *     PutSortReducer)</li>
   * </ul>
   * The user should be sure to set the map output value class to either KeyValue or Put before
   * running this function.
   *
   * @deprecated Use {@link #configureIncrementalLoad(Job, Table, RegionLocator)} instead.
   */
  @Deprecated
  public static void configureIncrementalLoad(Job job, HTable table)
      throws IOException {
    // Delegates with the HTable's own descriptor and region locator.
    configureIncrementalLoad(job, table.getTableDescriptor(), table.getRegionLocator());
  }
493 
  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat2's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
   *     PutSortReducer)</li>
   *   <li>Sets the HBase cluster key to load region locations for locality-sensitive</li>
   * </ul>
   * The user should be sure to set the map output value class to either KeyValue or Put before
   * running this function.
   */
  public static void configureIncrementalLoad(Job job, Table table, RegionLocator regionLocator)
      throws IOException {
    configureIncrementalLoad(job, table.getTableDescriptor(), regionLocator);
    // Record the source table's ZooKeeper connection details so the reduce-side writers
    // can look up region locations on the correct cluster.
    configureRemoteCluster(job, table.getConfiguration());
  }
514 
  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat2's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
   *     PutSortReducer)</li>
   * </ul>
   * The user should be sure to set the map output value class to either KeyValue or Put before
   * running this function.
   */
  public static void configureIncrementalLoad(Job job, HTableDescriptor tableDescriptor,
      RegionLocator regionLocator) throws IOException {
    // Delegates to the internal overload, fixing the output format to this class.
    configureIncrementalLoad(job, tableDescriptor, regionLocator, HFileOutputFormat2.class);
  }
533 
534   static void configureIncrementalLoad(Job job, HTableDescriptor tableDescriptor,
535       RegionLocator regionLocator, Class<? extends OutputFormat<?, ?>> cls) throws IOException,
536       UnsupportedEncodingException {
537     Configuration conf = job.getConfiguration();
538     job.setOutputKeyClass(ImmutableBytesWritable.class);
539     job.setOutputValueClass(KeyValue.class);
540     job.setOutputFormatClass(cls);
541 
542     // Based on the configured map output class, set the correct reducer to properly
543     // sort the incoming values.
544     // TODO it would be nice to pick one or the other of these formats.
545     if (KeyValue.class.equals(job.getMapOutputValueClass())) {
546       job.setReducerClass(KeyValueSortReducer.class);
547     } else if (Put.class.equals(job.getMapOutputValueClass())) {
548       job.setReducerClass(PutSortReducer.class);
549     } else if (Text.class.equals(job.getMapOutputValueClass())) {
550       job.setReducerClass(TextSortReducer.class);
551     } else {
552       LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
553     }
554 
555     conf.setStrings("io.serializations", conf.get("io.serializations"),
556         MutationSerialization.class.getName(), ResultSerialization.class.getName(),
557         KeyValueSerialization.class.getName());
558 
559     if (conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) {
560       // record this table name for creating writer by favored nodes
561       LOG.info("bulkload locality sensitive enabled");
562       conf.set(OUTPUT_TABLE_NAME_CONF_KEY, regionLocator.getName().getNameAsString());
563     }
564 
565     // Use table's region boundaries for TOP split points.
566     LOG.info("Looking up current regions for table " + tableDescriptor.getTableName());
567     List<ImmutableBytesWritable> startKeys = getRegionStartKeys(regionLocator);
568     LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
569         "to match current region count");
570     job.setNumReduceTasks(startKeys.size());
571 
572     configurePartitioner(job, startKeys);
573     // Set compression algorithms based on column families
574     configureCompression(conf, tableDescriptor);
575     configureBloomType(tableDescriptor, conf);
576     configureBlockSize(tableDescriptor, conf);
577     configureDataBlockEncoding(tableDescriptor, conf);
578 
579     TableMapReduceUtil.addDependencyJars(job);
580     TableMapReduceUtil.initCredentials(job);
581     LOG.info("Incremental table " + regionLocator.getName() + " output configured.");
582   }
583   
584   public static void configureIncrementalLoadMap(Job job, Table table) throws IOException {
585     Configuration conf = job.getConfiguration();
586 
587     job.setOutputKeyClass(ImmutableBytesWritable.class);
588     job.setOutputValueClass(KeyValue.class);
589     job.setOutputFormatClass(HFileOutputFormat2.class);
590 
591     // Set compression algorithms based on column families
592     configureCompression(conf, table.getTableDescriptor());
593     configureBloomType(table.getTableDescriptor(), conf);
594     configureBlockSize(table.getTableDescriptor(), conf);
595     HTableDescriptor tableDescriptor = table.getTableDescriptor();
596     configureDataBlockEncoding(tableDescriptor, conf);
597 
598     TableMapReduceUtil.addDependencyJars(job);
599     TableMapReduceUtil.initCredentials(job);
600     LOG.info("Incremental table " + table.getName() + " output configured.");
601   }
602 
603   /**
604    * Configure HBase cluster key for remote cluster to load region location for locality-sensitive
605    * if it's enabled.
606    * It's not necessary to call this method explicitly when the cluster key for HBase cluster to be
607    * used to load region location is configured in the job configuration.
608    * Call this method when another HBase cluster key is configured in the job configuration.
609    * For example, you should call when you load data from HBase cluster A using
610    * {@link TableInputFormat} and generate hfiles for HBase cluster B.
611    * Otherwise, HFileOutputFormat2 fetch location from cluster A and locality-sensitive won't
612    * working correctly.
613    * {@link #configureIncrementalLoad(Job, Table, RegionLocator)} calls this method using
614    * {@link Table#getConfiguration} as clusterConf.
615    * See HBASE-25608.
616    *
617    * @param job which has configuration to be updated
618    * @param clusterConf which contains cluster key of the HBase cluster to be locality-sensitive
619    *
620    * @see #configureIncrementalLoad(Job, Table, RegionLocator)
621    * @see #LOCALITY_SENSITIVE_CONF_KEY
622    * @see #REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY
623    * @see #REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY
624    * @see #REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY
625    */
626   public static void configureRemoteCluster(Job job, Configuration clusterConf) {
627     Configuration conf = job.getConfiguration();
628 
629     if (!conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) {
630       return;
631     }
632 
633     final String quorum = clusterConf.get(HConstants.ZOOKEEPER_QUORUM);
634     final int clientPort = clusterConf.getInt(
635       HConstants.ZOOKEEPER_CLIENT_PORT, HConstants.DEFAULT_ZOOKEPER_CLIENT_PORT);
636     final String parent = clusterConf.get(
637       HConstants.ZOOKEEPER_ZNODE_PARENT, HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT);
638 
639     conf.set(REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY, quorum);
640     conf.setInt(REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY, clientPort);
641     conf.set(REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY, parent);
642 
643     LOG.info("ZK configs for remote cluster of bulkload is configured: " +
644       quorum + ":" + clientPort + "/" + parent);
645   }
646 
647   /**
648    * Runs inside the task to deserialize column family to compression algorithm
649    * map from the configuration.
650    *
651    * @param conf to read the serialized values from
652    * @return a map from column family to the configured compression algorithm
653    */
654   @InterfaceAudience.Private
655   static Map<byte[], Algorithm> createFamilyCompressionMap(Configuration
656       conf) {
657     Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
658         COMPRESSION_FAMILIES_CONF_KEY);
659     Map<byte[], Algorithm> compressionMap = new TreeMap<byte[],
660         Algorithm>(Bytes.BYTES_COMPARATOR);
661     for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
662       Algorithm algorithm = AbstractHFileWriter.compressionByName
663           (e.getValue());
664       compressionMap.put(e.getKey(), algorithm);
665     }
666     return compressionMap;
667   }
668 
669   /**
670    * Runs inside the task to deserialize column family to bloom filter type
671    * map from the configuration.
672    *
673    * @param conf to read the serialized values from
674    * @return a map from column family to the the configured bloom filter type
675    */
676   @InterfaceAudience.Private
677   static Map<byte[], BloomType> createFamilyBloomTypeMap(Configuration conf) {
678     Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
679         BLOOM_TYPE_FAMILIES_CONF_KEY);
680     Map<byte[], BloomType> bloomTypeMap = new TreeMap<byte[],
681         BloomType>(Bytes.BYTES_COMPARATOR);
682     for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
683       BloomType bloomType = BloomType.valueOf(e.getValue());
684       bloomTypeMap.put(e.getKey(), bloomType);
685     }
686     return bloomTypeMap;
687   }
688 
689   /**
690    * Runs inside the task to deserialize column family to block size
691    * map from the configuration.
692    *
693    * @param conf to read the serialized values from
694    * @return a map from column family to the configured block size
695    */
696   @InterfaceAudience.Private
697   static Map<byte[], Integer> createFamilyBlockSizeMap(Configuration conf) {
698     Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
699         BLOCK_SIZE_FAMILIES_CONF_KEY);
700     Map<byte[], Integer> blockSizeMap = new TreeMap<byte[],
701         Integer>(Bytes.BYTES_COMPARATOR);
702     for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
703       Integer blockSize = Integer.parseInt(e.getValue());
704       blockSizeMap.put(e.getKey(), blockSize);
705     }
706     return blockSizeMap;
707   }
708 
709   /**
710    * Runs inside the task to deserialize column family to data block encoding
711    * type map from the configuration.
712    *
713    * @param conf to read the serialized values from
714    * @return a map from column family to HFileDataBlockEncoder for the
715    *         configured data block type for the family
716    */
717   @InterfaceAudience.Private
718   static Map<byte[], DataBlockEncoding> createFamilyDataBlockEncodingMap(
719       Configuration conf) {
720     Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
721         DATABLOCK_ENCODING_FAMILIES_CONF_KEY);
722     Map<byte[], DataBlockEncoding> encoderMap = new TreeMap<byte[],
723         DataBlockEncoding>(Bytes.BYTES_COMPARATOR);
724     for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
725       encoderMap.put(e.getKey(), DataBlockEncoding.valueOf((e.getValue())));
726     }
727     return encoderMap;
728   }
729 
730 
731   /**
732    * Run inside the task to deserialize column family to given conf value map.
733    *
734    * @param conf to read the serialized values from
735    * @param confName conf key to read from the configuration
736    * @return a map of column family to the given configuration value
737    */
738   private static Map<byte[], String> createFamilyConfValueMap(
739       Configuration conf, String confName) {
740     Map<byte[], String> confValMap = new TreeMap<byte[], String>(Bytes.BYTES_COMPARATOR);
741     String confVal = conf.get(confName, "");
742     for (String familyConf : confVal.split("&")) {
743       String[] familySplit = familyConf.split("=");
744       if (familySplit.length != 2) {
745         continue;
746       }
747       try {
748         confValMap.put(Bytes.toBytes(URLDecoder.decode(familySplit[0], "UTF-8")),
749             URLDecoder.decode(familySplit[1], "UTF-8"));
750       } catch (UnsupportedEncodingException e) {
751         // will not happen with UTF-8 encoding
752         throw new AssertionError(e);
753       }
754     }
755     return confValMap;
756   }
757 
758   /**
759    * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
760    * <code>splitPoints</code>. Cleans up the partitions file after job exists.
761    */
762   static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
763       throws IOException {
764     Configuration conf = job.getConfiguration();
765     // create the partitions file
766     FileSystem fs = FileSystem.get(conf);
767     String hbaseTmpFsDir =
768         conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY,
769           HConstants.DEFAULT_TEMPORARY_HDFS_DIRECTORY);
770     Path partitionsPath = new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID());
771     fs.makeQualified(partitionsPath);
772     writePartitions(conf, partitionsPath, splitPoints);
773     fs.deleteOnExit(partitionsPath);
774 
775     // configure job to use it
776     job.setPartitionerClass(TotalOrderPartitioner.class);
777     TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
778   }
779 
780   /**
781    * Serialize column family to compression algorithm map to configuration.
782    * Invoked while configuring the MR job for incremental load.
783    *
784    * @param table to read the properties from
785    * @param conf to persist serialized values into
786    * @throws IOException
787    *           on failure to read column family descriptors
788    */
789   @edu.umd.cs.findbugs.annotations.SuppressWarnings(
790       value="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE")
791   @InterfaceAudience.Private
792   static void configureCompression(Configuration conf, HTableDescriptor tableDescriptor)
793       throws UnsupportedEncodingException {
794     StringBuilder compressionConfigValue = new StringBuilder();
795     if(tableDescriptor == null){
796       // could happen with mock table instance
797       return;
798     }
799     Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
800     int i = 0;
801     for (HColumnDescriptor familyDescriptor : families) {
802       if (i++ > 0) {
803         compressionConfigValue.append('&');
804       }
805       compressionConfigValue.append(URLEncoder.encode(
806         familyDescriptor.getNameAsString(), "UTF-8"));
807       compressionConfigValue.append('=');
808       compressionConfigValue.append(URLEncoder.encode(
809         familyDescriptor.getCompression().getName(), "UTF-8"));
810     }
811     // Get rid of the last ampersand
812     conf.set(COMPRESSION_FAMILIES_CONF_KEY, compressionConfigValue.toString());
813   }
814 
815   /**
816    * Serialize column family to block size map to configuration.
817    * Invoked while configuring the MR job for incremental load.
818    * @param tableDescriptor to read the properties from
819    * @param conf to persist serialized values into
820    *
821    * @throws IOException
822    *           on failure to read column family descriptors
823    */
824   @InterfaceAudience.Private
825   static void configureBlockSize(HTableDescriptor tableDescriptor, Configuration conf)
826       throws UnsupportedEncodingException {
827     StringBuilder blockSizeConfigValue = new StringBuilder();
828     if (tableDescriptor == null) {
829       // could happen with mock table instance
830       return;
831     }
832     Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
833     int i = 0;
834     for (HColumnDescriptor familyDescriptor : families) {
835       if (i++ > 0) {
836         blockSizeConfigValue.append('&');
837       }
838       blockSizeConfigValue.append(URLEncoder.encode(
839           familyDescriptor.getNameAsString(), "UTF-8"));
840       blockSizeConfigValue.append('=');
841       blockSizeConfigValue.append(URLEncoder.encode(
842           String.valueOf(familyDescriptor.getBlocksize()), "UTF-8"));
843     }
844     // Get rid of the last ampersand
845     conf.set(BLOCK_SIZE_FAMILIES_CONF_KEY, blockSizeConfigValue.toString());
846   }
847 
848   /**
849    * Serialize column family to bloom type map to configuration.
850    * Invoked while configuring the MR job for incremental load.
851    * @param tableDescriptor to read the properties from
852    * @param conf to persist serialized values into
853    *
854    * @throws IOException
855    *           on failure to read column family descriptors
856    */
857   @InterfaceAudience.Private
858   static void configureBloomType(HTableDescriptor tableDescriptor, Configuration conf)
859       throws UnsupportedEncodingException {
860     if (tableDescriptor == null) {
861       // could happen with mock table instance
862       return;
863     }
864     StringBuilder bloomTypeConfigValue = new StringBuilder();
865     Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
866     int i = 0;
867     for (HColumnDescriptor familyDescriptor : families) {
868       if (i++ > 0) {
869         bloomTypeConfigValue.append('&');
870       }
871       bloomTypeConfigValue.append(URLEncoder.encode(
872         familyDescriptor.getNameAsString(), "UTF-8"));
873       bloomTypeConfigValue.append('=');
874       String bloomType = familyDescriptor.getBloomFilterType().toString();
875       if (bloomType == null) {
876         bloomType = HColumnDescriptor.DEFAULT_BLOOMFILTER;
877       }
878       bloomTypeConfigValue.append(URLEncoder.encode(bloomType, "UTF-8"));
879     }
880     conf.set(BLOOM_TYPE_FAMILIES_CONF_KEY, bloomTypeConfigValue.toString());
881   }
882 
883   /**
884    * Serialize column family to data block encoding map to configuration.
885    * Invoked while configuring the MR job for incremental load.
886    *
887    * @param table to read the properties from
888    * @param conf to persist serialized values into
889    * @throws IOException
890    *           on failure to read column family descriptors
891    */
892   @InterfaceAudience.Private
893   static void configureDataBlockEncoding(HTableDescriptor tableDescriptor,
894       Configuration conf) throws UnsupportedEncodingException {
895     if (tableDescriptor == null) {
896       // could happen with mock table instance
897       return;
898     }
899     StringBuilder dataBlockEncodingConfigValue = new StringBuilder();
900     Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
901     int i = 0;
902     for (HColumnDescriptor familyDescriptor : families) {
903       if (i++ > 0) {
904         dataBlockEncodingConfigValue.append('&');
905       }
906       dataBlockEncodingConfigValue.append(
907           URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
908       dataBlockEncodingConfigValue.append('=');
909       DataBlockEncoding encoding = familyDescriptor.getDataBlockEncoding();
910       if (encoding == null) {
911         encoding = DataBlockEncoding.NONE;
912       }
913       dataBlockEncodingConfigValue.append(URLEncoder.encode(encoding.toString(),
914           "UTF-8"));
915     }
916     conf.set(DATABLOCK_ENCODING_FAMILIES_CONF_KEY,
917         dataBlockEncodingConfigValue.toString());
918   }
919 }