001/** 002 * 003 * Licensed to the Apache Software Foundation (ASF) under one 004 * or more contributor license agreements. See the NOTICE file 005 * distributed with this work for additional information 006 * regarding copyright ownership. The ASF licenses this file 007 * to you under the Apache License, Version 2.0 (the 008 * "License"); you may not use this file except in compliance 009 * with the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019package org.apache.hadoop.hbase.mapreduce; 020 021import java.io.File; 022import java.io.IOException; 023import java.net.URL; 024import java.net.URLDecoder; 025import java.util.ArrayList; 026import java.util.Base64; 027import java.util.Collection; 028import java.util.Enumeration; 029import java.util.HashMap; 030import java.util.HashSet; 031import java.util.List; 032import java.util.Map; 033import java.util.Set; 034import java.util.zip.ZipEntry; 035import java.util.zip.ZipFile; 036 037import org.apache.hadoop.conf.Configuration; 038import org.apache.hadoop.fs.FileSystem; 039import org.apache.hadoop.fs.Path; 040import org.apache.hadoop.hbase.HBaseConfiguration; 041import org.apache.hadoop.hbase.HConstants; 042import org.apache.hadoop.hbase.MetaTableAccessor; 043import org.apache.hadoop.hbase.TableName; 044import org.apache.yetus.audience.InterfaceAudience; 045import org.slf4j.Logger; 046import org.slf4j.LoggerFactory; 047import org.apache.hadoop.hbase.client.Connection; 048import org.apache.hadoop.hbase.client.ConnectionFactory; 049import org.apache.hadoop.hbase.client.Put; 050import org.apache.hadoop.hbase.client.Scan; 
051import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 052import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 053import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos; 054import org.apache.hadoop.hbase.security.User; 055import org.apache.hadoop.hbase.security.UserProvider; 056import org.apache.hadoop.hbase.security.token.TokenUtil; 057import org.apache.hadoop.hbase.util.Bytes; 058import org.apache.hadoop.hbase.util.RegionSplitter; 059import org.apache.hadoop.hbase.zookeeper.ZKConfig; 060import org.apache.hadoop.io.Writable; 061import org.apache.hadoop.mapreduce.InputFormat; 062import org.apache.hadoop.mapreduce.Job; 063import org.apache.hadoop.util.StringUtils; 064 065import com.codahale.metrics.MetricRegistry; 066 067/** 068 * Utility for {@link TableMapper} and {@link TableReducer} 069 */ 070@SuppressWarnings({ "rawtypes", "unchecked" }) 071@InterfaceAudience.Public 072public class TableMapReduceUtil { 073 private static final Logger LOG = LoggerFactory.getLogger(TableMapReduceUtil.class); 074 public static final String TABLE_INPUT_CLASS_KEY = "hbase.table.input.class"; 075 076 /** 077 * Use this before submitting a TableMap job. It will appropriately set up 078 * the job. 079 * 080 * @param table The table name to read from. 081 * @param scan The scan instance with the columns, time range etc. 082 * @param mapper The mapper class to use. 083 * @param outputKeyClass The class of the output key. 084 * @param outputValueClass The class of the output value. 085 * @param job The current job to adjust. Make sure the passed job is 086 * carrying all necessary HBase configuration. 087 * @throws IOException When setting up the details fails. 088 */ 089 public static void initTableMapperJob(String table, Scan scan, 090 Class<? 
extends TableMapper> mapper, 091 Class<?> outputKeyClass, 092 Class<?> outputValueClass, Job job) 093 throws IOException { 094 initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass, 095 job, true); 096 } 097 098 099 /** 100 * Use this before submitting a TableMap job. It will appropriately set up 101 * the job. 102 * 103 * @param table The table name to read from. 104 * @param scan The scan instance with the columns, time range etc. 105 * @param mapper The mapper class to use. 106 * @param outputKeyClass The class of the output key. 107 * @param outputValueClass The class of the output value. 108 * @param job The current job to adjust. Make sure the passed job is 109 * carrying all necessary HBase configuration. 110 * @throws IOException When setting up the details fails. 111 */ 112 public static void initTableMapperJob(TableName table, 113 Scan scan, 114 Class<? extends TableMapper> mapper, 115 Class<?> outputKeyClass, 116 Class<?> outputValueClass, 117 Job job) throws IOException { 118 initTableMapperJob(table.getNameAsString(), 119 scan, 120 mapper, 121 outputKeyClass, 122 outputValueClass, 123 job, 124 true); 125 } 126 127 /** 128 * Use this before submitting a TableMap job. It will appropriately set up 129 * the job. 130 * 131 * @param table Binary representation of the table name to read from. 132 * @param scan The scan instance with the columns, time range etc. 133 * @param mapper The mapper class to use. 134 * @param outputKeyClass The class of the output key. 135 * @param outputValueClass The class of the output value. 136 * @param job The current job to adjust. Make sure the passed job is 137 * carrying all necessary HBase configuration. 138 * @throws IOException When setting up the details fails. 139 */ 140 public static void initTableMapperJob(byte[] table, Scan scan, 141 Class<? 
extends TableMapper> mapper, 142 Class<?> outputKeyClass, 143 Class<?> outputValueClass, Job job) 144 throws IOException { 145 initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass, 146 job, true); 147 } 148 149 /** 150 * Use this before submitting a TableMap job. It will appropriately set up 151 * the job. 152 * 153 * @param table The table name to read from. 154 * @param scan The scan instance with the columns, time range etc. 155 * @param mapper The mapper class to use. 156 * @param outputKeyClass The class of the output key. 157 * @param outputValueClass The class of the output value. 158 * @param job The current job to adjust. Make sure the passed job is 159 * carrying all necessary HBase configuration. 160 * @param addDependencyJars upload HBase jars and jars for any of the configured 161 * job classes via the distributed cache (tmpjars). 162 * @throws IOException When setting up the details fails. 163 */ 164 public static void initTableMapperJob(String table, Scan scan, 165 Class<? extends TableMapper> mapper, 166 Class<?> outputKeyClass, 167 Class<?> outputValueClass, Job job, 168 boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass) 169 throws IOException { 170 initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass, job, 171 addDependencyJars, true, inputFormatClass); 172 } 173 174 175 /** 176 * Use this before submitting a TableMap job. It will appropriately set up 177 * the job. 178 * 179 * @param table The table name to read from. 180 * @param scan The scan instance with the columns, time range etc. 181 * @param mapper The mapper class to use. 182 * @param outputKeyClass The class of the output key. 183 * @param outputValueClass The class of the output value. 184 * @param job The current job to adjust. Make sure the passed job is 185 * carrying all necessary HBase configuration. 
186 * @param addDependencyJars upload HBase jars and jars for any of the configured 187 * job classes via the distributed cache (tmpjars). 188 * @param initCredentials whether to initialize hbase auth credentials for the job 189 * @param inputFormatClass the input format 190 * @throws IOException When setting up the details fails. 191 */ 192 public static void initTableMapperJob(String table, Scan scan, 193 Class<? extends TableMapper> mapper, 194 Class<?> outputKeyClass, 195 Class<?> outputValueClass, Job job, 196 boolean addDependencyJars, boolean initCredentials, 197 Class<? extends InputFormat> inputFormatClass) 198 throws IOException { 199 job.setInputFormatClass(inputFormatClass); 200 if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass); 201 if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass); 202 job.setMapperClass(mapper); 203 if (Put.class.equals(outputValueClass)) { 204 job.setCombinerClass(PutCombiner.class); 205 } 206 Configuration conf = job.getConfiguration(); 207 HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf)); 208 conf.set(TableInputFormat.INPUT_TABLE, table); 209 conf.set(TableInputFormat.SCAN, convertScanToString(scan)); 210 conf.setStrings("io.serializations", conf.get("io.serializations"), 211 MutationSerialization.class.getName(), ResultSerialization.class.getName(), 212 CellSerialization.class.getName()); 213 if (addDependencyJars) { 214 addDependencyJars(job); 215 } 216 if (initCredentials) { 217 initCredentials(job); 218 } 219 } 220 221 /** 222 * Use this before submitting a TableMap job. It will appropriately set up 223 * the job. 224 * 225 * @param table Binary representation of the table name to read from. 226 * @param scan The scan instance with the columns, time range etc. 227 * @param mapper The mapper class to use. 228 * @param outputKeyClass The class of the output key. 229 * @param outputValueClass The class of the output value. 230 * @param job The current job to adjust. 
Make sure the passed job is 231 * carrying all necessary HBase configuration. 232 * @param addDependencyJars upload HBase jars and jars for any of the configured 233 * job classes via the distributed cache (tmpjars). 234 * @param inputFormatClass The class of the input format 235 * @throws IOException When setting up the details fails. 236 */ 237 public static void initTableMapperJob(byte[] table, Scan scan, 238 Class<? extends TableMapper> mapper, 239 Class<?> outputKeyClass, 240 Class<?> outputValueClass, Job job, 241 boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass) 242 throws IOException { 243 initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, 244 outputValueClass, job, addDependencyJars, inputFormatClass); 245 } 246 247 /** 248 * Use this before submitting a TableMap job. It will appropriately set up 249 * the job. 250 * 251 * @param table Binary representation of the table name to read from. 252 * @param scan The scan instance with the columns, time range etc. 253 * @param mapper The mapper class to use. 254 * @param outputKeyClass The class of the output key. 255 * @param outputValueClass The class of the output value. 256 * @param job The current job to adjust. Make sure the passed job is 257 * carrying all necessary HBase configuration. 258 * @param addDependencyJars upload HBase jars and jars for any of the configured 259 * job classes via the distributed cache (tmpjars). 260 * @throws IOException When setting up the details fails. 261 */ 262 public static void initTableMapperJob(byte[] table, Scan scan, 263 Class<? 
extends TableMapper> mapper, 264 Class<?> outputKeyClass, 265 Class<?> outputValueClass, Job job, 266 boolean addDependencyJars) 267 throws IOException { 268 initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass, job, 269 addDependencyJars, getConfiguredInputFormat(job)); 270 } 271 272 /** 273 * @return {@link TableInputFormat} .class unless Configuration has something else at 274 * {@link #TABLE_INPUT_CLASS_KEY}. 275 */ 276 private static Class<? extends InputFormat> getConfiguredInputFormat(Job job) { 277 return (Class<? extends InputFormat>)job.getConfiguration(). 278 getClass(TABLE_INPUT_CLASS_KEY, TableInputFormat.class); 279 } 280 281 /** 282 * Use this before submitting a TableMap job. It will appropriately set up 283 * the job. 284 * 285 * @param table The table name to read from. 286 * @param scan The scan instance with the columns, time range etc. 287 * @param mapper The mapper class to use. 288 * @param outputKeyClass The class of the output key. 289 * @param outputValueClass The class of the output value. 290 * @param job The current job to adjust. Make sure the passed job is 291 * carrying all necessary HBase configuration. 292 * @param addDependencyJars upload HBase jars and jars for any of the configured 293 * job classes via the distributed cache (tmpjars). 294 * @throws IOException When setting up the details fails. 295 */ 296 public static void initTableMapperJob(String table, Scan scan, 297 Class<? extends TableMapper> mapper, 298 Class<?> outputKeyClass, 299 Class<?> outputValueClass, Job job, 300 boolean addDependencyJars) 301 throws IOException { 302 initTableMapperJob(table, scan, mapper, outputKeyClass, 303 outputValueClass, job, addDependencyJars, getConfiguredInputFormat(job)); 304 } 305 306 /** 307 * Enable a basic on-heap cache for these jobs. Any BlockCache implementation based on 308 * direct memory will likely cause the map tasks to OOM when opening the region. 
This 309 * is done here instead of in TableSnapshotRegionRecordReader in case an advanced user 310 * wants to override this behavior in their job. 311 */ 312 public static void resetCacheConfig(Configuration conf) { 313 conf.setFloat( 314 HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, HConstants.HFILE_BLOCK_CACHE_SIZE_DEFAULT); 315 conf.setFloat(HConstants.BUCKET_CACHE_SIZE_KEY, 0f); 316 conf.unset(HConstants.BUCKET_CACHE_IOENGINE_KEY); 317 } 318 319 /** 320 * Sets up the job for reading from one or more table snapshots, with one or more scans 321 * per snapshot. 322 * It bypasses hbase servers and read directly from snapshot files. 323 * 324 * @param snapshotScans map of snapshot name to scans on that snapshot. 325 * @param mapper The mapper class to use. 326 * @param outputKeyClass The class of the output key. 327 * @param outputValueClass The class of the output value. 328 * @param job The current job to adjust. Make sure the passed job is 329 * carrying all necessary HBase configuration. 330 * @param addDependencyJars upload HBase jars and jars for any of the configured 331 * job classes via the distributed cache (tmpjars). 332 */ 333 public static void initMultiTableSnapshotMapperJob(Map<String, Collection<Scan>> snapshotScans, 334 Class<? 
extends TableMapper> mapper, Class<?> outputKeyClass, Class<?> outputValueClass, 335 Job job, boolean addDependencyJars, Path tmpRestoreDir) throws IOException { 336 MultiTableSnapshotInputFormat.setInput(job.getConfiguration(), snapshotScans, tmpRestoreDir); 337 338 job.setInputFormatClass(MultiTableSnapshotInputFormat.class); 339 if (outputValueClass != null) { 340 job.setMapOutputValueClass(outputValueClass); 341 } 342 if (outputKeyClass != null) { 343 job.setMapOutputKeyClass(outputKeyClass); 344 } 345 job.setMapperClass(mapper); 346 Configuration conf = job.getConfiguration(); 347 HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf)); 348 349 if (addDependencyJars) { 350 addDependencyJars(job); 351 addDependencyJarsForClasses(job.getConfiguration(), MetricRegistry.class); 352 } 353 354 resetCacheConfig(job.getConfiguration()); 355 } 356 357 /** 358 * Sets up the job for reading from a table snapshot. It bypasses hbase servers and read directly 359 * from snapshot files. 360 * @param snapshotName The name of the snapshot (of a table) to read from. 361 * @param scan The scan instance with the columns, time range etc. 362 * @param mapper The mapper class to use. 363 * @param outputKeyClass The class of the output key. 364 * @param outputValueClass The class of the output value. 365 * @param job The current job to adjust. Make sure the passed job is carrying all necessary HBase 366 * configuration. 367 * @param addDependencyJars upload HBase jars and jars for any of the configured job classes via 368 * the distributed cache (tmpjars). 369 * @param tmpRestoreDir a temporary directory to copy the snapshot files into. Current user should 370 * have write permissions to this directory, and this should not be a subdirectory of 371 * rootdir. After the job is finished, restore directory can be deleted. 372 * @throws IOException When setting up the details fails. 
373 * @see TableSnapshotInputFormat 374 */ 375 public static void initTableSnapshotMapperJob(String snapshotName, Scan scan, 376 Class<? extends TableMapper> mapper, 377 Class<?> outputKeyClass, 378 Class<?> outputValueClass, Job job, 379 boolean addDependencyJars, Path tmpRestoreDir) 380 throws IOException { 381 TableSnapshotInputFormat.setInput(job, snapshotName, tmpRestoreDir); 382 initTableMapperJob(snapshotName, scan, mapper, outputKeyClass, outputValueClass, job, 383 addDependencyJars, false, TableSnapshotInputFormat.class); 384 resetCacheConfig(job.getConfiguration()); 385 } 386 387 /** 388 * Sets up the job for reading from a table snapshot. It bypasses hbase servers 389 * and read directly from snapshot files. 390 * 391 * @param snapshotName The name of the snapshot (of a table) to read from. 392 * @param scan The scan instance with the columns, time range etc. 393 * @param mapper The mapper class to use. 394 * @param outputKeyClass The class of the output key. 395 * @param outputValueClass The class of the output value. 396 * @param job The current job to adjust. Make sure the passed job is 397 * carrying all necessary HBase configuration. 398 * @param addDependencyJars upload HBase jars and jars for any of the configured 399 * job classes via the distributed cache (tmpjars). 400 * 401 * @param tmpRestoreDir a temporary directory to copy the snapshot files into. Current user should 402 * have write permissions to this directory, and this should not be a subdirectory of rootdir. 403 * After the job is finished, restore directory can be deleted. 404 * @param splitAlgo algorithm to split 405 * @param numSplitsPerRegion how many input splits to generate per one region 406 * @throws IOException When setting up the details fails. 407 * @see TableSnapshotInputFormat 408 */ 409 public static void initTableSnapshotMapperJob(String snapshotName, Scan scan, 410 Class<? 
extends TableMapper> mapper, 411 Class<?> outputKeyClass, 412 Class<?> outputValueClass, Job job, 413 boolean addDependencyJars, Path tmpRestoreDir, 414 RegionSplitter.SplitAlgorithm splitAlgo, 415 int numSplitsPerRegion) 416 throws IOException { 417 TableSnapshotInputFormat.setInput(job, snapshotName, tmpRestoreDir, splitAlgo, 418 numSplitsPerRegion); 419 initTableMapperJob(snapshotName, scan, mapper, outputKeyClass, 420 outputValueClass, job, addDependencyJars, false, TableSnapshotInputFormat.class); 421 resetCacheConfig(job.getConfiguration()); 422 } 423 424 /** 425 * Use this before submitting a Multi TableMap job. It will appropriately set 426 * up the job. 427 * 428 * @param scans The list of {@link Scan} objects to read from. 429 * @param mapper The mapper class to use. 430 * @param outputKeyClass The class of the output key. 431 * @param outputValueClass The class of the output value. 432 * @param job The current job to adjust. Make sure the passed job is carrying 433 * all necessary HBase configuration. 434 * @throws IOException When setting up the details fails. 435 */ 436 public static void initTableMapperJob(List<Scan> scans, 437 Class<? extends TableMapper> mapper, 438 Class<?> outputKeyClass, 439 Class<?> outputValueClass, Job job) throws IOException { 440 initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job, 441 true); 442 } 443 444 /** 445 * Use this before submitting a Multi TableMap job. It will appropriately set 446 * up the job. 447 * 448 * @param scans The list of {@link Scan} objects to read from. 449 * @param mapper The mapper class to use. 450 * @param outputKeyClass The class of the output key. 451 * @param outputValueClass The class of the output value. 452 * @param job The current job to adjust. Make sure the passed job is carrying 453 * all necessary HBase configuration. 454 * @param addDependencyJars upload HBase jars and jars for any of the 455 * configured job classes via the distributed cache (tmpjars). 
456 * @throws IOException When setting up the details fails. 457 */ 458 public static void initTableMapperJob(List<Scan> scans, 459 Class<? extends TableMapper> mapper, 460 Class<?> outputKeyClass, 461 Class<?> outputValueClass, Job job, 462 boolean addDependencyJars) throws IOException { 463 initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job, 464 addDependencyJars, true); 465 } 466 467 /** 468 * Use this before submitting a Multi TableMap job. It will appropriately set 469 * up the job. 470 * 471 * @param scans The list of {@link Scan} objects to read from. 472 * @param mapper The mapper class to use. 473 * @param outputKeyClass The class of the output key. 474 * @param outputValueClass The class of the output value. 475 * @param job The current job to adjust. Make sure the passed job is carrying 476 * all necessary HBase configuration. 477 * @param addDependencyJars upload HBase jars and jars for any of the 478 * configured job classes via the distributed cache (tmpjars). 479 * @param initCredentials whether to initialize hbase auth credentials for the job 480 * @throws IOException When setting up the details fails. 481 */ 482 public static void initTableMapperJob(List<Scan> scans, 483 Class<? 
extends TableMapper> mapper, 484 Class<?> outputKeyClass, 485 Class<?> outputValueClass, Job job, 486 boolean addDependencyJars, 487 boolean initCredentials) throws IOException { 488 job.setInputFormatClass(MultiTableInputFormat.class); 489 if (outputValueClass != null) { 490 job.setMapOutputValueClass(outputValueClass); 491 } 492 if (outputKeyClass != null) { 493 job.setMapOutputKeyClass(outputKeyClass); 494 } 495 job.setMapperClass(mapper); 496 Configuration conf = job.getConfiguration(); 497 HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf)); 498 List<String> scanStrings = new ArrayList<>(); 499 500 for (Scan scan : scans) { 501 scanStrings.add(convertScanToString(scan)); 502 } 503 job.getConfiguration().setStrings(MultiTableInputFormat.SCANS, 504 scanStrings.toArray(new String[scanStrings.size()])); 505 506 if (addDependencyJars) { 507 addDependencyJars(job); 508 } 509 510 if (initCredentials) { 511 initCredentials(job); 512 } 513 } 514 515 public static void initCredentials(Job job) throws IOException { 516 UserProvider userProvider = UserProvider.instantiate(job.getConfiguration()); 517 if (userProvider.isHadoopSecurityEnabled()) { 518 // propagate delegation related props from launcher job to MR job 519 if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { 520 job.getConfiguration().set("mapreduce.job.credentials.binary", 521 System.getenv("HADOOP_TOKEN_FILE_LOCATION")); 522 } 523 } 524 525 if (userProvider.isHBaseSecurityEnabled()) { 526 try { 527 // init credentials for remote cluster 528 String quorumAddress = job.getConfiguration().get(TableOutputFormat.QUORUM_ADDRESS); 529 User user = userProvider.getCurrent(); 530 if (quorumAddress != null) { 531 Configuration peerConf = HBaseConfiguration.createClusterConf(job.getConfiguration(), 532 quorumAddress, TableOutputFormat.OUTPUT_CONF_PREFIX); 533 Connection peerConn = ConnectionFactory.createConnection(peerConf); 534 try { 535 TokenUtil.addTokenForJob(peerConn, user, job); 536 } finally { 
537 peerConn.close(); 538 } 539 } 540 541 Connection conn = ConnectionFactory.createConnection(job.getConfiguration()); 542 try { 543 TokenUtil.addTokenForJob(conn, user, job); 544 } finally { 545 conn.close(); 546 } 547 } catch (InterruptedException ie) { 548 LOG.info("Interrupted obtaining user authentication token"); 549 Thread.currentThread().interrupt(); 550 } 551 } 552 } 553 554 /** 555 * Obtain an authentication token, for the specified cluster, on behalf of the current user 556 * and add it to the credentials for the given map reduce job. 557 * 558 * The quorumAddress is the key to the ZK ensemble, which contains: 559 * hbase.zookeeper.quorum, hbase.zookeeper.client.port and 560 * zookeeper.znode.parent 561 * 562 * @param job The job that requires the permission. 563 * @param quorumAddress string that contains the 3 required configuratins 564 * @throws IOException When the authentication token cannot be obtained. 565 * @deprecated Since 1.2.0 and will be removed in 3.0.0. Use 566 * {@link #initCredentialsForCluster(Job, Configuration)} instead. 567 * @see #initCredentialsForCluster(Job, Configuration) 568 * @see <a href="https://issues.apache.org/jira/browse/HBASE-14886">HBASE-14886</a> 569 */ 570 @Deprecated 571 public static void initCredentialsForCluster(Job job, String quorumAddress) 572 throws IOException { 573 Configuration peerConf = HBaseConfiguration.createClusterConf(job.getConfiguration(), 574 quorumAddress); 575 initCredentialsForCluster(job, peerConf); 576 } 577 578 /** 579 * Obtain an authentication token, for the specified cluster, on behalf of the current user 580 * and add it to the credentials for the given map reduce job. 581 * 582 * @param job The job that requires the permission. 583 * @param conf The configuration to use in connecting to the peer cluster 584 * @throws IOException When the authentication token cannot be obtained. 
585 */ 586 public static void initCredentialsForCluster(Job job, Configuration conf) 587 throws IOException { 588 UserProvider userProvider = UserProvider.instantiate(conf); 589 if (userProvider.isHBaseSecurityEnabled()) { 590 try { 591 Connection peerConn = ConnectionFactory.createConnection(conf); 592 try { 593 TokenUtil.addTokenForJob(peerConn, userProvider.getCurrent(), job); 594 } finally { 595 peerConn.close(); 596 } 597 } catch (InterruptedException e) { 598 LOG.info("Interrupted obtaining user authentication token"); 599 Thread.interrupted(); 600 } 601 } 602 } 603 604 /** 605 * Writes the given scan into a Base64 encoded string. 606 * 607 * @param scan The scan to write out. 608 * @return The scan saved in a Base64 encoded string. 609 * @throws IOException When writing the scan fails. 610 */ 611 public static String convertScanToString(Scan scan) throws IOException { 612 ClientProtos.Scan proto = ProtobufUtil.toScan(scan); 613 return Bytes.toString(Base64.getEncoder().encode(proto.toByteArray())); 614 } 615 616 /** 617 * Converts the given Base64 string back into a Scan instance. 618 * 619 * @param base64 The scan details. 620 * @return The newly created Scan instance. 621 * @throws IOException When reading the scan instance fails. 622 */ 623 public static Scan convertStringToScan(String base64) throws IOException { 624 byte [] decoded = Base64.getDecoder().decode(base64); 625 return ProtobufUtil.toScan(ClientProtos.Scan.parseFrom(decoded)); 626 } 627 628 /** 629 * Use this before submitting a TableReduce job. It will 630 * appropriately set up the JobConf. 631 * 632 * @param table The output table. 633 * @param reducer The reducer class to use. 634 * @param job The current job to adjust. 635 * @throws IOException When determining the region count fails. 636 */ 637 public static void initTableReducerJob(String table, 638 Class<? 
extends TableReducer> reducer, Job job) 639 throws IOException { 640 initTableReducerJob(table, reducer, job, null); 641 } 642 643 /** 644 * Use this before submitting a TableReduce job. It will 645 * appropriately set up the JobConf. 646 * 647 * @param table The output table. 648 * @param reducer The reducer class to use. 649 * @param job The current job to adjust. 650 * @param partitioner Partitioner to use. Pass <code>null</code> to use 651 * default partitioner. 652 * @throws IOException When determining the region count fails. 653 */ 654 public static void initTableReducerJob(String table, 655 Class<? extends TableReducer> reducer, Job job, 656 Class partitioner) throws IOException { 657 initTableReducerJob(table, reducer, job, partitioner, null, null, null); 658 } 659 660 /** 661 * Use this before submitting a TableReduce job. It will 662 * appropriately set up the JobConf. 663 * 664 * @param table The output table. 665 * @param reducer The reducer class to use. 666 * @param job The current job to adjust. Make sure the passed job is 667 * carrying all necessary HBase configuration. 668 * @param partitioner Partitioner to use. Pass <code>null</code> to use 669 * default partitioner. 670 * @param quorumAddress Distant cluster to write to; default is null for 671 * output to the cluster that is designated in <code>hbase-site.xml</code>. 672 * Set this String to the zookeeper ensemble of an alternate remote cluster 673 * when you would have the reduce write a cluster that is other than the 674 * default; e.g. copying tables between clusters, the source would be 675 * designated by <code>hbase-site.xml</code> and this param would have the 676 * ensemble address of the remote cluster. The format to pass is particular. 677 * Pass <code> <hbase.zookeeper.quorum>:< 678 * hbase.zookeeper.client.port>:<zookeeper.znode.parent> 679 * </code> such as <code>server,server2,server3:2181:/hbase</code>. 
680 * @param serverClass redefined hbase.regionserver.class 681 * @param serverImpl redefined hbase.regionserver.impl 682 * @throws IOException When determining the region count fails. 683 */ 684 public static void initTableReducerJob(String table, 685 Class<? extends TableReducer> reducer, Job job, 686 Class partitioner, String quorumAddress, String serverClass, 687 String serverImpl) throws IOException { 688 initTableReducerJob(table, reducer, job, partitioner, quorumAddress, 689 serverClass, serverImpl, true); 690 } 691 692 /** 693 * Use this before submitting a TableReduce job. It will 694 * appropriately set up the JobConf. 695 * 696 * @param table The output table. 697 * @param reducer The reducer class to use. 698 * @param job The current job to adjust. Make sure the passed job is 699 * carrying all necessary HBase configuration. 700 * @param partitioner Partitioner to use. Pass <code>null</code> to use 701 * default partitioner. 702 * @param quorumAddress Distant cluster to write to; default is null for 703 * output to the cluster that is designated in <code>hbase-site.xml</code>. 704 * Set this String to the zookeeper ensemble of an alternate remote cluster 705 * when you would have the reduce write a cluster that is other than the 706 * default; e.g. copying tables between clusters, the source would be 707 * designated by <code>hbase-site.xml</code> and this param would have the 708 * ensemble address of the remote cluster. The format to pass is particular. 709 * Pass <code> <hbase.zookeeper.quorum>:< 710 * hbase.zookeeper.client.port>:<zookeeper.znode.parent> 711 * </code> such as <code>server,server2,server3:2181:/hbase</code>. 712 * @param serverClass redefined hbase.regionserver.class 713 * @param serverImpl redefined hbase.regionserver.impl 714 * @param addDependencyJars upload HBase jars and jars for any of the configured 715 * job classes via the distributed cache (tmpjars). 716 * @throws IOException When determining the region count fails. 
717 */ 718 public static void initTableReducerJob(String table, 719 Class<? extends TableReducer> reducer, Job job, 720 Class partitioner, String quorumAddress, String serverClass, 721 String serverImpl, boolean addDependencyJars) throws IOException { 722 723 Configuration conf = job.getConfiguration(); 724 HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf)); 725 job.setOutputFormatClass(TableOutputFormat.class); 726 if (reducer != null) job.setReducerClass(reducer); 727 conf.set(TableOutputFormat.OUTPUT_TABLE, table); 728 conf.setStrings("io.serializations", conf.get("io.serializations"), 729 MutationSerialization.class.getName(), ResultSerialization.class.getName()); 730 // If passed a quorum/ensemble address, pass it on to TableOutputFormat. 731 if (quorumAddress != null) { 732 // Calling this will validate the format 733 ZKConfig.validateClusterKey(quorumAddress); 734 conf.set(TableOutputFormat.QUORUM_ADDRESS,quorumAddress); 735 } 736 if (serverClass != null && serverImpl != null) { 737 conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass); 738 conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl); 739 } 740 job.setOutputKeyClass(ImmutableBytesWritable.class); 741 job.setOutputValueClass(Writable.class); 742 if (partitioner == HRegionPartitioner.class) { 743 job.setPartitionerClass(HRegionPartitioner.class); 744 int regions = MetaTableAccessor.getRegionCount(conf, TableName.valueOf(table)); 745 if (job.getNumReduceTasks() > regions) { 746 job.setNumReduceTasks(regions); 747 } 748 } else if (partitioner != null) { 749 job.setPartitionerClass(partitioner); 750 } 751 752 if (addDependencyJars) { 753 addDependencyJars(job); 754 } 755 756 initCredentials(job); 757 } 758 759 /** 760 * Ensures that the given number of reduce tasks for the given job 761 * configuration does not exceed the number of regions for the given table. 762 * 763 * @param table The table to get the region count for. 764 * @param job The current job to adjust. 
765 * @throws IOException When retrieving the table details fails. 766 */ 767 public static void limitNumReduceTasks(String table, Job job) 768 throws IOException { 769 int regions = 770 MetaTableAccessor.getRegionCount(job.getConfiguration(), TableName.valueOf(table)); 771 if (job.getNumReduceTasks() > regions) 772 job.setNumReduceTasks(regions); 773 } 774 775 /** 776 * Sets the number of reduce tasks for the given job configuration to the 777 * number of regions the given table has. 778 * 779 * @param table The table to get the region count for. 780 * @param job The current job to adjust. 781 * @throws IOException When retrieving the table details fails. 782 */ 783 public static void setNumReduceTasks(String table, Job job) 784 throws IOException { 785 job.setNumReduceTasks(MetaTableAccessor.getRegionCount(job.getConfiguration(), 786 TableName.valueOf(table))); 787 } 788 789 /** 790 * Sets the number of rows to return and cache with each scanner iteration. 791 * Higher caching values will enable faster mapreduce jobs at the expense of 792 * requiring more heap to contain the cached rows. 793 * 794 * @param job The current job to adjust. 795 * @param batchSize The number of rows to return in batch with each scanner 796 * iteration. 797 */ 798 public static void setScannerCaching(Job job, int batchSize) { 799 job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize); 800 } 801 802 /** 803 * Add HBase and its dependencies (only) to the job configuration. 804 * <p> 805 * This is intended as a low-level API, facilitating code reuse between this 806 * class and its mapred counterpart. It also of use to external tools that 807 * need to build a MapReduce job that interacts with HBase but want 808 * fine-grained control over the jars shipped to the cluster. 809 * </p> 810 * @param conf The Configuration object to extend with dependencies. 
811 * @see org.apache.hadoop.hbase.mapred.TableMapReduceUtil 812 * @see <a href="https://issues.apache.org/jira/browse/PIG-3285">PIG-3285</a> 813 */ 814 public static void addHBaseDependencyJars(Configuration conf) throws IOException { 815 addDependencyJarsForClasses(conf, 816 // explicitly pull a class from each module 817 org.apache.hadoop.hbase.HConstants.class, // hbase-common 818 org.apache.hadoop.hbase.protobuf.generated.ClientProtos.class, // hbase-protocol 819 org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.class, // hbase-protocol-shaded 820 org.apache.hadoop.hbase.client.Put.class, // hbase-client 821 org.apache.hadoop.hbase.ipc.RpcServer.class, // hbase-server 822 org.apache.hadoop.hbase.CompatibilityFactory.class, // hbase-hadoop-compat 823 org.apache.hadoop.hbase.mapreduce.JobUtil.class, // hbase-hadoop2-compat 824 org.apache.hadoop.hbase.mapreduce.TableMapper.class, // hbase-mapreduce 825 org.apache.hadoop.hbase.metrics.impl.FastLongHistogram.class, // hbase-metrics 826 org.apache.hadoop.hbase.metrics.Snapshot.class, // hbase-metrics-api 827 org.apache.hadoop.hbase.replication.ReplicationUtils.class, // hbase-replication 828 org.apache.hadoop.hbase.http.HttpServer.class, // hbase-http 829 org.apache.hadoop.hbase.procedure2.Procedure.class, // hbase-procedure 830 org.apache.hadoop.hbase.zookeeper.ZKWatcher.class, // hbase-zookeeper 831 org.apache.hbase.thirdparty.com.google.common.collect.Lists.class, // hb-shaded-miscellaneous 832 org.apache.hbase.thirdparty.com.google.gson.GsonBuilder.class, // hbase-shaded-gson 833 org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations.class, // hb-sh-protobuf 834 org.apache.hbase.thirdparty.io.netty.channel.Channel.class, // hbase-shaded-netty 835 org.apache.zookeeper.ZooKeeper.class, // zookeeper 836 com.google.protobuf.Message.class, // protobuf 837 org.apache.htrace.core.Tracer.class, // htrace 838 com.codahale.metrics.MetricRegistry.class, // metrics-core 839 
org.apache.commons.lang3.ArrayUtils.class); // commons-lang 840 } 841 842 /** 843 * Returns a classpath string built from the content of the "tmpjars" value in {@code conf}. 844 * Also exposed to shell scripts via `bin/hbase mapredcp`. 845 */ 846 public static String buildDependencyClasspath(Configuration conf) { 847 if (conf == null) { 848 throw new IllegalArgumentException("Must provide a configuration object."); 849 } 850 Set<String> paths = new HashSet<>(conf.getStringCollection("tmpjars")); 851 if (paths.isEmpty()) { 852 throw new IllegalArgumentException("Configuration contains no tmpjars."); 853 } 854 StringBuilder sb = new StringBuilder(); 855 for (String s : paths) { 856 // entries can take the form 'file:/path/to/file.jar'. 857 int idx = s.indexOf(":"); 858 if (idx != -1) s = s.substring(idx + 1); 859 if (sb.length() > 0) sb.append(File.pathSeparator); 860 sb.append(s); 861 } 862 return sb.toString(); 863 } 864 865 /** 866 * Add the HBase dependency jars as well as jars for any of the configured 867 * job classes to the job configuration, so that JobClient will ship them 868 * to the cluster and add them to the DistributedCache. 869 */ 870 public static void addDependencyJars(Job job) throws IOException { 871 addHBaseDependencyJars(job.getConfiguration()); 872 try { 873 addDependencyJarsForClasses(job.getConfiguration(), 874 // when making changes here, consider also mapred.TableMapReduceUtil 875 // pull job classes 876 job.getMapOutputKeyClass(), 877 job.getMapOutputValueClass(), 878 job.getInputFormatClass(), 879 job.getOutputKeyClass(), 880 job.getOutputValueClass(), 881 job.getOutputFormatClass(), 882 job.getPartitionerClass(), 883 job.getCombinerClass()); 884 } catch (ClassNotFoundException e) { 885 throw new IOException(e); 886 } 887 } 888 889 /** 890 * Add the jars containing the given classes to the job's configuration 891 * such that JobClient will ship them to the cluster and add them to 892 * the DistributedCache. 
893 * @deprecated since 1.3.0 and will be removed in 3.0.0. Use {@link #addDependencyJars(Job)} 894 * instead. 895 * @see #addDependencyJars(Job) 896 * @see <a href="https://issues.apache.org/jira/browse/HBASE-8386">HBASE-8386</a> 897 */ 898 @Deprecated 899 public static void addDependencyJars(Configuration conf, 900 Class<?>... classes) throws IOException { 901 LOG.warn("The addDependencyJars(Configuration, Class<?>...) method has been deprecated since it" 902 + " is easy to use incorrectly. Most users should rely on addDependencyJars(Job) " + 903 "instead. See HBASE-8386 for more details."); 904 addDependencyJarsForClasses(conf, classes); 905 } 906 907 /** 908 * Add the jars containing the given classes to the job's configuration 909 * such that JobClient will ship them to the cluster and add them to 910 * the DistributedCache. 911 * 912 * N.B. that this method at most adds one jar per class given. If there is more than one 913 * jar available containing a class with the same name as a given class, we don't define 914 * which of those jars might be chosen. 915 * 916 * @param conf The Hadoop Configuration to modify 917 * @param classes will add just those dependencies needed to find the given classes 918 * @throws IOException if an underlying library call fails. 919 */ 920 @InterfaceAudience.Private 921 public static void addDependencyJarsForClasses(Configuration conf, 922 Class<?>... classes) throws IOException { 923 924 FileSystem localFs = FileSystem.getLocal(conf); 925 Set<String> jars = new HashSet<>(); 926 // Add jars that are already in the tmpjars variable 927 jars.addAll(conf.getStringCollection("tmpjars")); 928 929 // add jars as we find them to a map of contents jar name so that we can avoid 930 // creating new jars for classes that have already been packaged. 
931 Map<String, String> packagedClasses = new HashMap<>(); 932 933 // Add jars containing the specified classes 934 for (Class<?> clazz : classes) { 935 if (clazz == null) continue; 936 937 Path path = findOrCreateJar(clazz, localFs, packagedClasses); 938 if (path == null) { 939 LOG.warn("Could not find jar for class " + clazz + 940 " in order to ship it to the cluster."); 941 continue; 942 } 943 if (!localFs.exists(path)) { 944 LOG.warn("Could not validate jar file " + path + " for class " 945 + clazz); 946 continue; 947 } 948 jars.add(path.toString()); 949 } 950 if (jars.isEmpty()) return; 951 952 conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[jars.size()]))); 953 } 954 955 /** 956 * Finds the Jar for a class or creates it if it doesn't exist. If the class is in 957 * a directory in the classpath, it creates a Jar on the fly with the 958 * contents of the directory and returns the path to that Jar. If a Jar is 959 * created, it is created in the system temporary directory. Otherwise, 960 * returns an existing jar that contains a class of the same name. Maintains 961 * a mapping from jar contents to the tmp jar created. 962 * @param my_class the class to find. 963 * @param fs the FileSystem with which to qualify the returned path. 964 * @param packagedClasses a map of class name to path. 965 * @return a jar file that contains the class. 966 * @throws IOException 967 */ 968 private static Path findOrCreateJar(Class<?> my_class, FileSystem fs, 969 Map<String, String> packagedClasses) 970 throws IOException { 971 // attempt to locate an existing jar for the class. 
972 String jar = findContainingJar(my_class, packagedClasses); 973 if (null == jar || jar.isEmpty()) { 974 jar = getJar(my_class); 975 updateMap(jar, packagedClasses); 976 } 977 978 if (null == jar || jar.isEmpty()) { 979 return null; 980 } 981 982 LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar)); 983 return new Path(jar).makeQualified(fs.getUri(), fs.getWorkingDirectory()); 984 } 985 986 /** 987 * Add entries to <code>packagedClasses</code> corresponding to class files 988 * contained in <code>jar</code>. 989 * @param jar The jar who's content to list. 990 * @param packagedClasses map[class -> jar] 991 */ 992 private static void updateMap(String jar, Map<String, String> packagedClasses) throws IOException { 993 if (null == jar || jar.isEmpty()) { 994 return; 995 } 996 ZipFile zip = null; 997 try { 998 zip = new ZipFile(jar); 999 for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) { 1000 ZipEntry entry = iter.nextElement(); 1001 if (entry.getName().endsWith("class")) { 1002 packagedClasses.put(entry.getName(), jar); 1003 } 1004 } 1005 } finally { 1006 if (null != zip) zip.close(); 1007 } 1008 } 1009 1010 /** 1011 * Find a jar that contains a class of the same name, if any. It will return 1012 * a jar file, even if that is not the first thing on the class path that 1013 * has a class with the same name. Looks first on the classpath and then in 1014 * the <code>packagedClasses</code> map. 1015 * @param my_class the class to find. 1016 * @return a jar file that contains the class, or null. 
1017 * @throws IOException 1018 */ 1019 private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses) 1020 throws IOException { 1021 ClassLoader loader = my_class.getClassLoader(); 1022 1023 String class_file = my_class.getName().replaceAll("\\.", "/") + ".class"; 1024 1025 if (loader != null) { 1026 // first search the classpath 1027 for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) { 1028 URL url = itr.nextElement(); 1029 if ("jar".equals(url.getProtocol())) { 1030 String toReturn = url.getPath(); 1031 if (toReturn.startsWith("file:")) { 1032 toReturn = toReturn.substring("file:".length()); 1033 } 1034 // URLDecoder is a misnamed class, since it actually decodes 1035 // x-www-form-urlencoded MIME type rather than actual 1036 // URL encoding (which the file path has). Therefore it would 1037 // decode +s to ' 's which is incorrect (spaces are actually 1038 // either unencoded or encoded as "%20"). Replace +s first, so 1039 // that they are kept sacred during the decoding process. 1040 toReturn = toReturn.replaceAll("\\+", "%2B"); 1041 toReturn = URLDecoder.decode(toReturn, "UTF-8"); 1042 return toReturn.replaceAll("!.*$", ""); 1043 } 1044 } 1045 } 1046 1047 // now look in any jars we've packaged using JarFinder. Returns null when 1048 // no jar is found. 1049 return packagedClasses.get(class_file); 1050 } 1051 1052 /** 1053 * Invoke 'getJar' on a custom JarFinder implementation. Useful for some job 1054 * configuration contexts (HBASE-8140) and also for testing on MRv2. 1055 * check if we have HADOOP-9426. 1056 * @param my_class the class to find. 1057 * @return a jar file that contains the class, or null. 
*/
  private static String getJar(Class<?> my_class) {
    try {
      return JarFinder.getJar(my_class);
    } catch (Exception e) {
      // Any failure inside JarFinder is rethrown unchecked, keeping the cause.
      throw new RuntimeException("getJar invocation failed.", e);
    }
  }
}