001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.mapreduce;
020
021import java.io.File;
022import java.io.IOException;
023import java.net.URL;
024import java.net.URLDecoder;
025import java.util.ArrayList;
026import java.util.Base64;
027import java.util.Collection;
028import java.util.Enumeration;
029import java.util.HashMap;
030import java.util.HashSet;
031import java.util.List;
032import java.util.Map;
033import java.util.Set;
034import java.util.zip.ZipEntry;
035import java.util.zip.ZipFile;
036
037import org.apache.hadoop.conf.Configuration;
038import org.apache.hadoop.fs.FileSystem;
039import org.apache.hadoop.fs.Path;
040import org.apache.hadoop.hbase.HBaseConfiguration;
041import org.apache.hadoop.hbase.HConstants;
042import org.apache.hadoop.hbase.TableName;
043import org.apache.yetus.audience.InterfaceAudience;
044import org.slf4j.Logger;
045import org.slf4j.LoggerFactory;
046import org.apache.hadoop.hbase.client.Connection;
047import org.apache.hadoop.hbase.client.ConnectionFactory;
048import org.apache.hadoop.hbase.client.Put;
049import org.apache.hadoop.hbase.client.RegionLocator;
050import org.apache.hadoop.hbase.client.Scan;
051import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
052import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
053import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
054import org.apache.hadoop.hbase.security.User;
055import org.apache.hadoop.hbase.security.UserProvider;
056import org.apache.hadoop.hbase.security.token.TokenUtil;
057import org.apache.hadoop.hbase.util.Bytes;
058import org.apache.hadoop.hbase.util.RegionSplitter;
059import org.apache.hadoop.hbase.zookeeper.ZKConfig;
060import org.apache.hadoop.io.Writable;
061import org.apache.hadoop.mapreduce.InputFormat;
062import org.apache.hadoop.mapreduce.Job;
063import org.apache.hadoop.util.StringUtils;
064
065import com.codahale.metrics.MetricRegistry;
066
067/**
068 * Utility for {@link TableMapper} and {@link TableReducer}
069 */
070@SuppressWarnings({ "rawtypes", "unchecked" })
071@InterfaceAudience.Public
072public class TableMapReduceUtil {
  private static final Logger LOG = LoggerFactory.getLogger(TableMapReduceUtil.class);
  // Configuration key naming an alternate InputFormat class for table mapper jobs;
  // when unset, TableInputFormat is used (see getConfiguredInputFormat).
  public static final String TABLE_INPUT_CLASS_KEY = "hbase.table.input.class";
075
076  /**
077   * Use this before submitting a TableMap job. It will appropriately set up
078   * the job.
079   *
080   * @param table  The table name to read from.
081   * @param scan  The scan instance with the columns, time range etc.
082   * @param mapper  The mapper class to use.
083   * @param outputKeyClass  The class of the output key.
084   * @param outputValueClass  The class of the output value.
085   * @param job  The current job to adjust.  Make sure the passed job is
086   * carrying all necessary HBase configuration.
087   * @throws IOException When setting up the details fails.
088   */
089  public static void initTableMapperJob(String table, Scan scan,
090      Class<? extends TableMapper> mapper,
091      Class<?> outputKeyClass,
092      Class<?> outputValueClass, Job job)
093  throws IOException {
094    initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
095        job, true);
096  }
097
098
099  /**
100   * Use this before submitting a TableMap job. It will appropriately set up
101   * the job.
102   *
103   * @param table  The table name to read from.
104   * @param scan  The scan instance with the columns, time range etc.
105   * @param mapper  The mapper class to use.
106   * @param outputKeyClass  The class of the output key.
107   * @param outputValueClass  The class of the output value.
108   * @param job  The current job to adjust.  Make sure the passed job is
109   * carrying all necessary HBase configuration.
110   * @throws IOException When setting up the details fails.
111   */
112  public static void initTableMapperJob(TableName table,
113      Scan scan,
114      Class<? extends TableMapper> mapper,
115      Class<?> outputKeyClass,
116      Class<?> outputValueClass,
117      Job job) throws IOException {
118    initTableMapperJob(table.getNameAsString(),
119        scan,
120        mapper,
121        outputKeyClass,
122        outputValueClass,
123        job,
124        true);
125  }
126
127  /**
128   * Use this before submitting a TableMap job. It will appropriately set up
129   * the job.
130   *
131   * @param table Binary representation of the table name to read from.
132   * @param scan  The scan instance with the columns, time range etc.
133   * @param mapper  The mapper class to use.
134   * @param outputKeyClass  The class of the output key.
135   * @param outputValueClass  The class of the output value.
136   * @param job  The current job to adjust.  Make sure the passed job is
137   * carrying all necessary HBase configuration.
138   * @throws IOException When setting up the details fails.
139   */
140   public static void initTableMapperJob(byte[] table, Scan scan,
141      Class<? extends TableMapper> mapper,
142      Class<?> outputKeyClass,
143      Class<?> outputValueClass, Job job)
144  throws IOException {
145      initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass,
146              job, true);
147  }
148
149   /**
150    * Use this before submitting a TableMap job. It will appropriately set up
151    * the job.
152    *
153    * @param table  The table name to read from.
154    * @param scan  The scan instance with the columns, time range etc.
155    * @param mapper  The mapper class to use.
156    * @param outputKeyClass  The class of the output key.
157    * @param outputValueClass  The class of the output value.
158    * @param job  The current job to adjust.  Make sure the passed job is
159    * carrying all necessary HBase configuration.
160    * @param addDependencyJars upload HBase jars and jars for any of the configured
161    *           job classes via the distributed cache (tmpjars).
162    * @throws IOException When setting up the details fails.
163    */
164   public static void initTableMapperJob(String table, Scan scan,
165       Class<? extends TableMapper> mapper,
166       Class<?> outputKeyClass,
167       Class<?> outputValueClass, Job job,
168       boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
169   throws IOException {
170     initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass, job,
171         addDependencyJars, true, inputFormatClass);
172   }
173
174
175  /**
176   * Use this before submitting a TableMap job. It will appropriately set up
177   * the job.
178   *
179   * @param table  The table name to read from.
180   * @param scan  The scan instance with the columns, time range etc.
181   * @param mapper  The mapper class to use.
182   * @param outputKeyClass  The class of the output key.
183   * @param outputValueClass  The class of the output value.
184   * @param job  The current job to adjust.  Make sure the passed job is
185   * carrying all necessary HBase configuration.
186   * @param addDependencyJars upload HBase jars and jars for any of the configured
187   *           job classes via the distributed cache (tmpjars).
188   * @param initCredentials whether to initialize hbase auth credentials for the job
189   * @param inputFormatClass the input format
190   * @throws IOException When setting up the details fails.
191   */
192  public static void initTableMapperJob(String table, Scan scan,
193      Class<? extends TableMapper> mapper,
194      Class<?> outputKeyClass,
195      Class<?> outputValueClass, Job job,
196      boolean addDependencyJars, boolean initCredentials,
197      Class<? extends InputFormat> inputFormatClass)
198  throws IOException {
199    job.setInputFormatClass(inputFormatClass);
200    if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
201    if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
202    job.setMapperClass(mapper);
203    if (Put.class.equals(outputValueClass)) {
204      job.setCombinerClass(PutCombiner.class);
205    }
206    Configuration conf = job.getConfiguration();
207    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
208    conf.set(TableInputFormat.INPUT_TABLE, table);
209    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
210    conf.setStrings("io.serializations", conf.get("io.serializations"),
211        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
212        CellSerialization.class.getName());
213    if (addDependencyJars) {
214      addDependencyJars(job);
215    }
216    if (initCredentials) {
217      initCredentials(job);
218    }
219  }
220
221  /**
222   * Use this before submitting a TableMap job. It will appropriately set up
223   * the job.
224   *
225   * @param table Binary representation of the table name to read from.
226   * @param scan  The scan instance with the columns, time range etc.
227   * @param mapper  The mapper class to use.
228   * @param outputKeyClass  The class of the output key.
229   * @param outputValueClass  The class of the output value.
230   * @param job  The current job to adjust.  Make sure the passed job is
231   * carrying all necessary HBase configuration.
232   * @param addDependencyJars upload HBase jars and jars for any of the configured
233   *           job classes via the distributed cache (tmpjars).
234   * @param inputFormatClass The class of the input format
235   * @throws IOException When setting up the details fails.
236   */
237  public static void initTableMapperJob(byte[] table, Scan scan,
238      Class<? extends TableMapper> mapper,
239      Class<?> outputKeyClass,
240      Class<?> outputValueClass, Job job,
241      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
242  throws IOException {
243      initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
244              outputValueClass, job, addDependencyJars, inputFormatClass);
245  }
246
247  /**
248   * Use this before submitting a TableMap job. It will appropriately set up
249   * the job.
250   *
251   * @param table Binary representation of the table name to read from.
252   * @param scan  The scan instance with the columns, time range etc.
253   * @param mapper  The mapper class to use.
254   * @param outputKeyClass  The class of the output key.
255   * @param outputValueClass  The class of the output value.
256   * @param job  The current job to adjust.  Make sure the passed job is
257   * carrying all necessary HBase configuration.
258   * @param addDependencyJars upload HBase jars and jars for any of the configured
259   *           job classes via the distributed cache (tmpjars).
260   * @throws IOException When setting up the details fails.
261   */
262  public static void initTableMapperJob(byte[] table, Scan scan,
263      Class<? extends TableMapper> mapper,
264      Class<?> outputKeyClass,
265      Class<?> outputValueClass, Job job,
266      boolean addDependencyJars)
267  throws IOException {
268      initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass, job,
269        addDependencyJars, getConfiguredInputFormat(job));
270  }
271
272  /**
273   * @return {@link TableInputFormat} .class unless Configuration has something else at
274   *   {@link #TABLE_INPUT_CLASS_KEY}.
275   */
276  private static Class<? extends InputFormat> getConfiguredInputFormat(Job job) {
277    return (Class<? extends InputFormat>)job.getConfiguration().
278      getClass(TABLE_INPUT_CLASS_KEY, TableInputFormat.class);
279  }
280
281  /**
282   * Use this before submitting a TableMap job. It will appropriately set up
283   * the job.
284   *
285   * @param table The table name to read from.
286   * @param scan  The scan instance with the columns, time range etc.
287   * @param mapper  The mapper class to use.
288   * @param outputKeyClass  The class of the output key.
289   * @param outputValueClass  The class of the output value.
290   * @param job  The current job to adjust.  Make sure the passed job is
291   * carrying all necessary HBase configuration.
292   * @param addDependencyJars upload HBase jars and jars for any of the configured
293   *           job classes via the distributed cache (tmpjars).
294   * @throws IOException When setting up the details fails.
295   */
296  public static void initTableMapperJob(String table, Scan scan,
297      Class<? extends TableMapper> mapper,
298      Class<?> outputKeyClass,
299      Class<?> outputValueClass, Job job,
300      boolean addDependencyJars)
301  throws IOException {
302      initTableMapperJob(table, scan, mapper, outputKeyClass,
303              outputValueClass, job, addDependencyJars, getConfiguredInputFormat(job));
304  }
305
306  /**
307   * Enable a basic on-heap cache for these jobs. Any BlockCache implementation based on
308   * direct memory will likely cause the map tasks to OOM when opening the region. This
309   * is done here instead of in TableSnapshotRegionRecordReader in case an advanced user
310   * wants to override this behavior in their job.
311   */
312  public static void resetCacheConfig(Configuration conf) {
313    conf.setFloat(
314      HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, HConstants.HFILE_BLOCK_CACHE_SIZE_DEFAULT);
315    conf.setFloat(HConstants.BUCKET_CACHE_SIZE_KEY, 0f);
316    conf.unset(HConstants.BUCKET_CACHE_IOENGINE_KEY);
317  }
318
319  /**
320   * Sets up the job for reading from one or more table snapshots, with one or more scans
321   * per snapshot.
322   * It bypasses hbase servers and read directly from snapshot files.
323   *
324   * @param snapshotScans     map of snapshot name to scans on that snapshot.
325   * @param mapper            The mapper class to use.
326   * @param outputKeyClass    The class of the output key.
327   * @param outputValueClass  The class of the output value.
328   * @param job               The current job to adjust.  Make sure the passed job is
329   *                          carrying all necessary HBase configuration.
330   * @param addDependencyJars upload HBase jars and jars for any of the configured
331   *                          job classes via the distributed cache (tmpjars).
332   */
333  public static void initMultiTableSnapshotMapperJob(Map<String, Collection<Scan>> snapshotScans,
334      Class<? extends TableMapper> mapper, Class<?> outputKeyClass, Class<?> outputValueClass,
335      Job job, boolean addDependencyJars, Path tmpRestoreDir) throws IOException {
336    MultiTableSnapshotInputFormat.setInput(job.getConfiguration(), snapshotScans, tmpRestoreDir);
337
338    job.setInputFormatClass(MultiTableSnapshotInputFormat.class);
339    if (outputValueClass != null) {
340      job.setMapOutputValueClass(outputValueClass);
341    }
342    if (outputKeyClass != null) {
343      job.setMapOutputKeyClass(outputKeyClass);
344    }
345    job.setMapperClass(mapper);
346    Configuration conf = job.getConfiguration();
347    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
348
349    if (addDependencyJars) {
350      addDependencyJars(job);
351      addDependencyJarsForClasses(job.getConfiguration(), MetricRegistry.class);
352    }
353
354    resetCacheConfig(job.getConfiguration());
355  }
356
357  /**
358   * Sets up the job for reading from a table snapshot. It bypasses hbase servers and read directly
359   * from snapshot files.
360   * @param snapshotName The name of the snapshot (of a table) to read from.
361   * @param scan The scan instance with the columns, time range etc.
362   * @param mapper The mapper class to use.
363   * @param outputKeyClass The class of the output key.
364   * @param outputValueClass The class of the output value.
365   * @param job The current job to adjust. Make sure the passed job is carrying all necessary HBase
366   *          configuration.
367   * @param addDependencyJars upload HBase jars and jars for any of the configured job classes via
368   *          the distributed cache (tmpjars).
369   * @param tmpRestoreDir a temporary directory to copy the snapshot files into. Current user should
370   *          have write permissions to this directory, and this should not be a subdirectory of
371   *          rootdir. After the job is finished, restore directory can be deleted.
372   * @throws IOException When setting up the details fails.
373   * @see TableSnapshotInputFormat
374   */
375  public static void initTableSnapshotMapperJob(String snapshotName, Scan scan,
376      Class<? extends TableMapper> mapper,
377      Class<?> outputKeyClass,
378      Class<?> outputValueClass, Job job,
379      boolean addDependencyJars, Path tmpRestoreDir)
380      throws IOException {
381    TableSnapshotInputFormat.setInput(job, snapshotName, tmpRestoreDir);
382    initTableMapperJob(snapshotName, scan, mapper, outputKeyClass, outputValueClass, job,
383      addDependencyJars, false, TableSnapshotInputFormat.class);
384    resetCacheConfig(job.getConfiguration());
385  }
386
387  /**
388   * Sets up the job for reading from a table snapshot. It bypasses hbase servers
389   * and read directly from snapshot files.
390   *
391   * @param snapshotName The name of the snapshot (of a table) to read from.
392   * @param scan  The scan instance with the columns, time range etc.
393   * @param mapper  The mapper class to use.
394   * @param outputKeyClass  The class of the output key.
395   * @param outputValueClass  The class of the output value.
396   * @param job  The current job to adjust.  Make sure the passed job is
397   * carrying all necessary HBase configuration.
398   * @param addDependencyJars upload HBase jars and jars for any of the configured
399   *           job classes via the distributed cache (tmpjars).
400   *
401   * @param tmpRestoreDir a temporary directory to copy the snapshot files into. Current user should
402   * have write permissions to this directory, and this should not be a subdirectory of rootdir.
403   * After the job is finished, restore directory can be deleted.
404   * @param splitAlgo algorithm to split
405   * @param numSplitsPerRegion how many input splits to generate per one region
406   * @throws IOException When setting up the details fails.
407   * @see TableSnapshotInputFormat
408   */
409  public static void initTableSnapshotMapperJob(String snapshotName, Scan scan,
410                                                Class<? extends TableMapper> mapper,
411                                                Class<?> outputKeyClass,
412                                                Class<?> outputValueClass, Job job,
413                                                boolean addDependencyJars, Path tmpRestoreDir,
414                                                RegionSplitter.SplitAlgorithm splitAlgo,
415                                                int numSplitsPerRegion)
416          throws IOException {
417    TableSnapshotInputFormat.setInput(job, snapshotName, tmpRestoreDir, splitAlgo,
418            numSplitsPerRegion);
419    initTableMapperJob(snapshotName, scan, mapper, outputKeyClass,
420            outputValueClass, job, addDependencyJars, false, TableSnapshotInputFormat.class);
421    resetCacheConfig(job.getConfiguration());
422  }
423
424  /**
425   * Use this before submitting a Multi TableMap job. It will appropriately set
426   * up the job.
427   *
428   * @param scans The list of {@link Scan} objects to read from.
429   * @param mapper The mapper class to use.
430   * @param outputKeyClass The class of the output key.
431   * @param outputValueClass The class of the output value.
432   * @param job The current job to adjust. Make sure the passed job is carrying
433   *          all necessary HBase configuration.
434   * @throws IOException When setting up the details fails.
435   */
436  public static void initTableMapperJob(List<Scan> scans,
437      Class<? extends TableMapper> mapper,
438      Class<?> outputKeyClass,
439      Class<?> outputValueClass, Job job) throws IOException {
440    initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
441        true);
442  }
443
444  /**
445   * Use this before submitting a Multi TableMap job. It will appropriately set
446   * up the job.
447   *
448   * @param scans The list of {@link Scan} objects to read from.
449   * @param mapper The mapper class to use.
450   * @param outputKeyClass The class of the output key.
451   * @param outputValueClass The class of the output value.
452   * @param job The current job to adjust. Make sure the passed job is carrying
453   *          all necessary HBase configuration.
454   * @param addDependencyJars upload HBase jars and jars for any of the
455   *          configured job classes via the distributed cache (tmpjars).
456   * @throws IOException When setting up the details fails.
457   */
458  public static void initTableMapperJob(List<Scan> scans,
459      Class<? extends TableMapper> mapper,
460      Class<?> outputKeyClass,
461      Class<?> outputValueClass, Job job,
462      boolean addDependencyJars) throws IOException {
463    initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
464      addDependencyJars, true);
465  }
466
467  /**
468   * Use this before submitting a Multi TableMap job. It will appropriately set
469   * up the job.
470   *
471   * @param scans The list of {@link Scan} objects to read from.
472   * @param mapper The mapper class to use.
473   * @param outputKeyClass The class of the output key.
474   * @param outputValueClass The class of the output value.
475   * @param job The current job to adjust. Make sure the passed job is carrying
476   *          all necessary HBase configuration.
477   * @param addDependencyJars upload HBase jars and jars for any of the
478   *          configured job classes via the distributed cache (tmpjars).
479   * @param initCredentials whether to initialize hbase auth credentials for the job
480   * @throws IOException When setting up the details fails.
481   */
482  public static void initTableMapperJob(List<Scan> scans,
483      Class<? extends TableMapper> mapper,
484      Class<?> outputKeyClass,
485      Class<?> outputValueClass, Job job,
486      boolean addDependencyJars,
487      boolean initCredentials) throws IOException {
488    job.setInputFormatClass(MultiTableInputFormat.class);
489    if (outputValueClass != null) {
490      job.setMapOutputValueClass(outputValueClass);
491    }
492    if (outputKeyClass != null) {
493      job.setMapOutputKeyClass(outputKeyClass);
494    }
495    job.setMapperClass(mapper);
496    Configuration conf = job.getConfiguration();
497    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
498    List<String> scanStrings = new ArrayList<>();
499
500    for (Scan scan : scans) {
501      scanStrings.add(convertScanToString(scan));
502    }
503    job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
504      scanStrings.toArray(new String[scanStrings.size()]));
505
506    if (addDependencyJars) {
507      addDependencyJars(job);
508    }
509
510    if (initCredentials) {
511      initCredentials(job);
512    }
513  }
514
515  public static void initCredentials(Job job) throws IOException {
516    UserProvider userProvider = UserProvider.instantiate(job.getConfiguration());
517    if (userProvider.isHadoopSecurityEnabled()) {
518      // propagate delegation related props from launcher job to MR job
519      if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
520        job.getConfiguration().set("mapreduce.job.credentials.binary",
521                                   System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
522      }
523    }
524
525    if (userProvider.isHBaseSecurityEnabled()) {
526      try {
527        // init credentials for remote cluster
528        String quorumAddress = job.getConfiguration().get(TableOutputFormat.QUORUM_ADDRESS);
529        User user = userProvider.getCurrent();
530        if (quorumAddress != null) {
531          Configuration peerConf = HBaseConfiguration.createClusterConf(job.getConfiguration(),
532              quorumAddress, TableOutputFormat.OUTPUT_CONF_PREFIX);
533          Connection peerConn = ConnectionFactory.createConnection(peerConf);
534          try {
535            TokenUtil.addTokenForJob(peerConn, user, job);
536          } finally {
537            peerConn.close();
538          }
539        }
540
541        Connection conn = ConnectionFactory.createConnection(job.getConfiguration());
542        try {
543          TokenUtil.addTokenForJob(conn, user, job);
544        } finally {
545          conn.close();
546        }
547      } catch (InterruptedException ie) {
548        LOG.info("Interrupted obtaining user authentication token");
549        Thread.currentThread().interrupt();
550      }
551    }
552  }
553
554  /**
555   * Obtain an authentication token, for the specified cluster, on behalf of the current user
556   * and add it to the credentials for the given map reduce job.
557   *
558   * The quorumAddress is the key to the ZK ensemble, which contains:
559   * hbase.zookeeper.quorum, hbase.zookeeper.client.port and
560   * zookeeper.znode.parent
561   *
562   * @param job The job that requires the permission.
563   * @param quorumAddress string that contains the 3 required configuratins
564   * @throws IOException When the authentication token cannot be obtained.
565   * @deprecated Since 1.2.0 and will be removed in 3.0.0. Use
566   *   {@link #initCredentialsForCluster(Job, Configuration)} instead.
567   * @see #initCredentialsForCluster(Job, Configuration)
568   * @see <a href="https://issues.apache.org/jira/browse/HBASE-14886">HBASE-14886</a>
569   */
570  @Deprecated
571  public static void initCredentialsForCluster(Job job, String quorumAddress)
572      throws IOException {
573    Configuration peerConf = HBaseConfiguration.createClusterConf(job.getConfiguration(),
574        quorumAddress);
575    initCredentialsForCluster(job, peerConf);
576  }
577
578  /**
579   * Obtain an authentication token, for the specified cluster, on behalf of the current user
580   * and add it to the credentials for the given map reduce job.
581   *
582   * @param job The job that requires the permission.
583   * @param conf The configuration to use in connecting to the peer cluster
584   * @throws IOException When the authentication token cannot be obtained.
585   */
586  public static void initCredentialsForCluster(Job job, Configuration conf)
587      throws IOException {
588    UserProvider userProvider = UserProvider.instantiate(conf);
589    if (userProvider.isHBaseSecurityEnabled()) {
590      try {
591        Connection peerConn = ConnectionFactory.createConnection(conf);
592        try {
593          TokenUtil.addTokenForJob(peerConn, userProvider.getCurrent(), job);
594        } finally {
595          peerConn.close();
596        }
597      } catch (InterruptedException e) {
598        LOG.info("Interrupted obtaining user authentication token");
599        Thread.interrupted();
600      }
601    }
602  }
603
604  /**
605   * Writes the given scan into a Base64 encoded string.
606   *
607   * @param scan  The scan to write out.
608   * @return The scan saved in a Base64 encoded string.
609   * @throws IOException When writing the scan fails.
610   */
611  public static String convertScanToString(Scan scan) throws IOException {
612    ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
613    return Bytes.toString(Base64.getEncoder().encode(proto.toByteArray()));
614  }
615
616  /**
617   * Converts the given Base64 string back into a Scan instance.
618   *
619   * @param base64  The scan details.
620   * @return The newly created Scan instance.
621   * @throws IOException When reading the scan instance fails.
622   */
623  public static Scan convertStringToScan(String base64) throws IOException {
624    byte [] decoded = Base64.getDecoder().decode(base64);
625    return ProtobufUtil.toScan(ClientProtos.Scan.parseFrom(decoded));
626  }
627
628  /**
629   * Use this before submitting a TableReduce job. It will
630   * appropriately set up the JobConf.
631   *
632   * @param table  The output table.
633   * @param reducer  The reducer class to use.
634   * @param job  The current job to adjust.
635   * @throws IOException When determining the region count fails.
636   */
637  public static void initTableReducerJob(String table,
638    Class<? extends TableReducer> reducer, Job job)
639  throws IOException {
640    initTableReducerJob(table, reducer, job, null);
641  }
642
643  /**
644   * Use this before submitting a TableReduce job. It will
645   * appropriately set up the JobConf.
646   *
647   * @param table  The output table.
648   * @param reducer  The reducer class to use.
649   * @param job  The current job to adjust.
650   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
651   * default partitioner.
652   * @throws IOException When determining the region count fails.
653   */
654  public static void initTableReducerJob(String table,
655    Class<? extends TableReducer> reducer, Job job,
656    Class partitioner) throws IOException {
657    initTableReducerJob(table, reducer, job, partitioner, null, null, null);
658  }
659
660  /**
661   * Use this before submitting a TableReduce job. It will
662   * appropriately set up the JobConf.
663   *
664   * @param table  The output table.
665   * @param reducer  The reducer class to use.
666   * @param job  The current job to adjust.  Make sure the passed job is
667   * carrying all necessary HBase configuration.
668   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
669   * default partitioner.
670   * @param quorumAddress Distant cluster to write to; default is null for
671   * output to the cluster that is designated in <code>hbase-site.xml</code>.
672   * Set this String to the zookeeper ensemble of an alternate remote cluster
673   * when you would have the reduce write a cluster that is other than the
674   * default; e.g. copying tables between clusters, the source would be
675   * designated by <code>hbase-site.xml</code> and this param would have the
676   * ensemble address of the remote cluster.  The format to pass is particular.
677   * Pass <code> &lt;hbase.zookeeper.quorum&gt;:&lt;
678   *             hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;
679   * </code> such as <code>server,server2,server3:2181:/hbase</code>.
680   * @param serverClass redefined hbase.regionserver.class
681   * @param serverImpl redefined hbase.regionserver.impl
682   * @throws IOException When determining the region count fails.
683   */
684  public static void initTableReducerJob(String table,
685    Class<? extends TableReducer> reducer, Job job,
686    Class partitioner, String quorumAddress, String serverClass,
687    String serverImpl) throws IOException {
688    initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
689        serverClass, serverImpl, true);
690  }
691
692  /**
693   * Use this before submitting a TableReduce job. It will
694   * appropriately set up the JobConf.
695   *
696   * @param table  The output table.
697   * @param reducer  The reducer class to use.
698   * @param job  The current job to adjust.  Make sure the passed job is
699   * carrying all necessary HBase configuration.
700   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
701   * default partitioner.
702   * @param quorumAddress Distant cluster to write to; default is null for
703   * output to the cluster that is designated in <code>hbase-site.xml</code>.
704   * Set this String to the zookeeper ensemble of an alternate remote cluster
705   * when you would have the reduce write a cluster that is other than the
706   * default; e.g. copying tables between clusters, the source would be
707   * designated by <code>hbase-site.xml</code> and this param would have the
708   * ensemble address of the remote cluster.  The format to pass is particular.
709   * Pass <code> &lt;hbase.zookeeper.quorum&gt;:&lt;
710   *             hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;
711   * </code> such as <code>server,server2,server3:2181:/hbase</code>.
712   * @param serverClass redefined hbase.regionserver.class
713   * @param serverImpl redefined hbase.regionserver.impl
714   * @param addDependencyJars upload HBase jars and jars for any of the configured
715   *           job classes via the distributed cache (tmpjars).
716   * @throws IOException When determining the region count fails.
717   */
718  public static void initTableReducerJob(String table,
719    Class<? extends TableReducer> reducer, Job job,
720    Class partitioner, String quorumAddress, String serverClass,
721    String serverImpl, boolean addDependencyJars) throws IOException {
722
723    Configuration conf = job.getConfiguration();
724    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
725    job.setOutputFormatClass(TableOutputFormat.class);
726    if (reducer != null) job.setReducerClass(reducer);
727    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
728    conf.setStrings("io.serializations", conf.get("io.serializations"),
729        MutationSerialization.class.getName(), ResultSerialization.class.getName());
730    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
731    if (quorumAddress != null) {
732      // Calling this will validate the format
733      ZKConfig.validateClusterKey(quorumAddress);
734      conf.set(TableOutputFormat.QUORUM_ADDRESS,quorumAddress);
735    }
736    if (serverClass != null && serverImpl != null) {
737      conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
738      conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
739    }
740    job.setOutputKeyClass(ImmutableBytesWritable.class);
741    job.setOutputValueClass(Writable.class);
742    if (partitioner == HRegionPartitioner.class) {
743      job.setPartitionerClass(HRegionPartitioner.class);
744      int regions = getRegionCount(conf, TableName.valueOf(table));
745      if (job.getNumReduceTasks() > regions) {
746        job.setNumReduceTasks(regions);
747      }
748    } else if (partitioner != null) {
749      job.setPartitionerClass(partitioner);
750    }
751
752    if (addDependencyJars) {
753      addDependencyJars(job);
754    }
755
756    initCredentials(job);
757  }
758
759  /**
760   * Ensures that the given number of reduce tasks for the given job
761   * configuration does not exceed the number of regions for the given table.
762   *
763   * @param table  The table to get the region count for.
764   * @param job  The current job to adjust.
765   * @throws IOException When retrieving the table details fails.
766   */
767  public static void limitNumReduceTasks(String table, Job job) throws IOException {
768    int regions = getRegionCount(job.getConfiguration(), TableName.valueOf(table));
769    if (job.getNumReduceTasks() > regions) {
770      job.setNumReduceTasks(regions);
771    }
772  }
773
774  /**
775   * Sets the number of reduce tasks for the given job configuration to the
776   * number of regions the given table has.
777   *
778   * @param table  The table to get the region count for.
779   * @param job  The current job to adjust.
780   * @throws IOException When retrieving the table details fails.
781   */
782  public static void setNumReduceTasks(String table, Job job) throws IOException {
783    job.setNumReduceTasks(getRegionCount(job.getConfiguration(), TableName.valueOf(table)));
784  }
785
786  /**
787   * Sets the number of rows to return and cache with each scanner iteration.
788   * Higher caching values will enable faster mapreduce jobs at the expense of
789   * requiring more heap to contain the cached rows.
790   *
791   * @param job The current job to adjust.
792   * @param batchSize The number of rows to return in batch with each scanner
793   * iteration.
794   */
795  public static void setScannerCaching(Job job, int batchSize) {
796    job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
797  }
798
799  /**
800   * Add HBase and its dependencies (only) to the job configuration.
801   * <p>
802   * This is intended as a low-level API, facilitating code reuse between this
803   * class and its mapred counterpart. It also of use to external tools that
804   * need to build a MapReduce job that interacts with HBase but want
805   * fine-grained control over the jars shipped to the cluster.
806   * </p>
807   * @param conf The Configuration object to extend with dependencies.
808   * @see org.apache.hadoop.hbase.mapred.TableMapReduceUtil
809   * @see <a href="https://issues.apache.org/jira/browse/PIG-3285">PIG-3285</a>
810   */
811  public static void addHBaseDependencyJars(Configuration conf) throws IOException {
812    addDependencyJarsForClasses(conf,
813      // explicitly pull a class from each module
814      org.apache.hadoop.hbase.HConstants.class,                      // hbase-common
815      org.apache.hadoop.hbase.protobuf.generated.ClientProtos.class, // hbase-protocol
816      org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.class, // hbase-protocol-shaded
817      org.apache.hadoop.hbase.client.Put.class,                      // hbase-client
818      org.apache.hadoop.hbase.ipc.RpcServer.class,                   // hbase-server
819      org.apache.hadoop.hbase.CompatibilityFactory.class,            // hbase-hadoop-compat
820      org.apache.hadoop.hbase.mapreduce.JobUtil.class,               // hbase-hadoop2-compat
821      org.apache.hadoop.hbase.mapreduce.TableMapper.class,           // hbase-mapreduce
822      org.apache.hadoop.hbase.metrics.impl.FastLongHistogram.class,  // hbase-metrics
823      org.apache.hadoop.hbase.metrics.Snapshot.class,                // hbase-metrics-api
824      org.apache.hadoop.hbase.replication.ReplicationUtils.class,    // hbase-replication
825      org.apache.hadoop.hbase.http.HttpServer.class,                 // hbase-http
826      org.apache.hadoop.hbase.procedure2.Procedure.class,            // hbase-procedure
827      org.apache.hadoop.hbase.zookeeper.ZKWatcher.class,             // hbase-zookeeper
828      org.apache.hbase.thirdparty.com.google.common.collect.Lists.class, // hb-shaded-miscellaneous
829      org.apache.hbase.thirdparty.com.google.gson.GsonBuilder.class, // hbase-shaded-gson
830      org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations.class, // hb-sh-protobuf
831      org.apache.hbase.thirdparty.io.netty.channel.Channel.class,    // hbase-shaded-netty
832      org.apache.zookeeper.ZooKeeper.class,                          // zookeeper
833      com.google.protobuf.Message.class,                             // protobuf
834      org.apache.htrace.core.Tracer.class,                           // htrace
835      com.codahale.metrics.MetricRegistry.class,                     // metrics-core
836      org.apache.commons.lang3.ArrayUtils.class);                    // commons-lang
837  }
838
839  /**
840   * Returns a classpath string built from the content of the "tmpjars" value in {@code conf}.
841   * Also exposed to shell scripts via `bin/hbase mapredcp`.
842   */
843  public static String buildDependencyClasspath(Configuration conf) {
844    if (conf == null) {
845      throw new IllegalArgumentException("Must provide a configuration object.");
846    }
847    Set<String> paths = new HashSet<>(conf.getStringCollection("tmpjars"));
848    if (paths.isEmpty()) {
849      throw new IllegalArgumentException("Configuration contains no tmpjars.");
850    }
851    StringBuilder sb = new StringBuilder();
852    for (String s : paths) {
853      // entries can take the form 'file:/path/to/file.jar'.
854      int idx = s.indexOf(":");
855      if (idx != -1) s = s.substring(idx + 1);
856      if (sb.length() > 0) sb.append(File.pathSeparator);
857      sb.append(s);
858    }
859    return sb.toString();
860  }
861
862  /**
863   * Add the HBase dependency jars as well as jars for any of the configured
864   * job classes to the job configuration, so that JobClient will ship them
865   * to the cluster and add them to the DistributedCache.
866   */
867  public static void addDependencyJars(Job job) throws IOException {
868    addHBaseDependencyJars(job.getConfiguration());
869    try {
870      addDependencyJarsForClasses(job.getConfiguration(),
871          // when making changes here, consider also mapred.TableMapReduceUtil
872          // pull job classes
873          job.getMapOutputKeyClass(),
874          job.getMapOutputValueClass(),
875          job.getInputFormatClass(),
876          job.getOutputKeyClass(),
877          job.getOutputValueClass(),
878          job.getOutputFormatClass(),
879          job.getPartitionerClass(),
880          job.getCombinerClass());
881    } catch (ClassNotFoundException e) {
882      throw new IOException(e);
883    }
884  }
885
886  /**
887   * Add the jars containing the given classes to the job's configuration
888   * such that JobClient will ship them to the cluster and add them to
889   * the DistributedCache.
890   * @deprecated since 1.3.0 and will be removed in 3.0.0. Use {@link #addDependencyJars(Job)}
891   *   instead.
892   * @see #addDependencyJars(Job)
893   * @see <a href="https://issues.apache.org/jira/browse/HBASE-8386">HBASE-8386</a>
894   */
895  @Deprecated
896  public static void addDependencyJars(Configuration conf,
897      Class<?>... classes) throws IOException {
898    LOG.warn("The addDependencyJars(Configuration, Class<?>...) method has been deprecated since it"
899             + " is easy to use incorrectly. Most users should rely on addDependencyJars(Job) " +
900             "instead. See HBASE-8386 for more details.");
901    addDependencyJarsForClasses(conf, classes);
902  }
903
904  /**
905   * Add the jars containing the given classes to the job's configuration
906   * such that JobClient will ship them to the cluster and add them to
907   * the DistributedCache.
908   *
909   * N.B. that this method at most adds one jar per class given. If there is more than one
910   * jar available containing a class with the same name as a given class, we don't define
911   * which of those jars might be chosen.
912   *
913   * @param conf The Hadoop Configuration to modify
914   * @param classes will add just those dependencies needed to find the given classes
915   * @throws IOException if an underlying library call fails.
916   */
917  @InterfaceAudience.Private
918  public static void addDependencyJarsForClasses(Configuration conf,
919      Class<?>... classes) throws IOException {
920
921    FileSystem localFs = FileSystem.getLocal(conf);
922    Set<String> jars = new HashSet<>();
923    // Add jars that are already in the tmpjars variable
924    jars.addAll(conf.getStringCollection("tmpjars"));
925
926    // add jars as we find them to a map of contents jar name so that we can avoid
927    // creating new jars for classes that have already been packaged.
928    Map<String, String> packagedClasses = new HashMap<>();
929
930    // Add jars containing the specified classes
931    for (Class<?> clazz : classes) {
932      if (clazz == null) continue;
933
934      Path path = findOrCreateJar(clazz, localFs, packagedClasses);
935      if (path == null) {
936        LOG.warn("Could not find jar for class " + clazz +
937                 " in order to ship it to the cluster.");
938        continue;
939      }
940      if (!localFs.exists(path)) {
941        LOG.warn("Could not validate jar file " + path + " for class "
942                 + clazz);
943        continue;
944      }
945      jars.add(path.toString());
946    }
947    if (jars.isEmpty()) return;
948
949    conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[jars.size()])));
950  }
951
952  /**
953   * Finds the Jar for a class or creates it if it doesn't exist. If the class is in
954   * a directory in the classpath, it creates a Jar on the fly with the
955   * contents of the directory and returns the path to that Jar. If a Jar is
956   * created, it is created in the system temporary directory. Otherwise,
957   * returns an existing jar that contains a class of the same name. Maintains
958   * a mapping from jar contents to the tmp jar created.
959   * @param my_class the class to find.
960   * @param fs the FileSystem with which to qualify the returned path.
961   * @param packagedClasses a map of class name to path.
962   * @return a jar file that contains the class.
963   * @throws IOException
964   */
965  private static Path findOrCreateJar(Class<?> my_class, FileSystem fs,
966      Map<String, String> packagedClasses)
967  throws IOException {
968    // attempt to locate an existing jar for the class.
969    String jar = findContainingJar(my_class, packagedClasses);
970    if (null == jar || jar.isEmpty()) {
971      jar = getJar(my_class);
972      updateMap(jar, packagedClasses);
973    }
974
975    if (null == jar || jar.isEmpty()) {
976      return null;
977    }
978
979    LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar));
980    return new Path(jar).makeQualified(fs.getUri(), fs.getWorkingDirectory());
981  }
982
983  /**
984   * Add entries to <code>packagedClasses</code> corresponding to class files
985   * contained in <code>jar</code>.
986   * @param jar The jar who's content to list.
987   * @param packagedClasses map[class -> jar]
988   */
989  private static void updateMap(String jar, Map<String, String> packagedClasses) throws IOException {
990    if (null == jar || jar.isEmpty()) {
991      return;
992    }
993    ZipFile zip = null;
994    try {
995      zip = new ZipFile(jar);
996      for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) {
997        ZipEntry entry = iter.nextElement();
998        if (entry.getName().endsWith("class")) {
999          packagedClasses.put(entry.getName(), jar);
1000        }
1001      }
1002    } finally {
1003      if (null != zip) zip.close();
1004    }
1005  }
1006
1007  /**
1008   * Find a jar that contains a class of the same name, if any. It will return
1009   * a jar file, even if that is not the first thing on the class path that
1010   * has a class with the same name. Looks first on the classpath and then in
1011   * the <code>packagedClasses</code> map.
1012   * @param my_class the class to find.
1013   * @return a jar file that contains the class, or null.
1014   * @throws IOException
1015   */
1016  private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses)
1017      throws IOException {
1018    ClassLoader loader = my_class.getClassLoader();
1019
1020    String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
1021
1022    if (loader != null) {
1023      // first search the classpath
1024      for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
1025        URL url = itr.nextElement();
1026        if ("jar".equals(url.getProtocol())) {
1027          String toReturn = url.getPath();
1028          if (toReturn.startsWith("file:")) {
1029            toReturn = toReturn.substring("file:".length());
1030          }
1031          // URLDecoder is a misnamed class, since it actually decodes
1032          // x-www-form-urlencoded MIME type rather than actual
1033          // URL encoding (which the file path has). Therefore it would
1034          // decode +s to ' 's which is incorrect (spaces are actually
1035          // either unencoded or encoded as "%20"). Replace +s first, so
1036          // that they are kept sacred during the decoding process.
1037          toReturn = toReturn.replaceAll("\\+", "%2B");
1038          toReturn = URLDecoder.decode(toReturn, "UTF-8");
1039          return toReturn.replaceAll("!.*$", "");
1040        }
1041      }
1042    }
1043
1044    // now look in any jars we've packaged using JarFinder. Returns null when
1045    // no jar is found.
1046    return packagedClasses.get(class_file);
1047  }
1048
1049  /**
1050   * Invoke 'getJar' on a custom JarFinder implementation. Useful for some job
1051   * configuration contexts (HBASE-8140) and also for testing on MRv2.
1052   * check if we have HADOOP-9426.
1053   * @param my_class the class to find.
1054   * @return a jar file that contains the class, or null.
1055   */
1056  private static String getJar(Class<?> my_class) {
1057    String ret = null;
1058    try {
1059      ret = JarFinder.getJar(my_class);
1060    } catch (Exception e) {
1061      // toss all other exceptions, related to reflection failure
1062      throw new RuntimeException("getJar invocation failed.", e);
1063    }
1064
1065    return ret;
1066  }
1067
1068  private static int getRegionCount(Configuration conf, TableName tableName) throws IOException {
1069    try (Connection conn = ConnectionFactory.createConnection(conf);
1070      RegionLocator locator = conn.getRegionLocator(tableName)) {
1071      return locator.getAllRegionLocations().size();
1072    }
1073  }
1074}