/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.Closeable;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Strings;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.util.StringUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;

/**
 * A base for {@link TableInputFormat}s. Receives a {@link Connection}, a {@link TableName},
 * a {@link Scan} instance that defines the input columns etc. Subclasses may use
 * other TableRecordReader implementations.
 *
 * Subclasses MUST ensure initializeTable(Connection, TableName) is called for an instance to
 * function properly. Each of the entry points to this class used by the MapReduce framework,
 * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
 * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
 * retrieving the necessary configuration information. If your subclass overrides either of these
 * methods, either call the parent version or call initialize yourself.
 *
 * <p>
 * An example of a subclass:
 * <pre>
 *   class ExampleTIF extends TableInputFormatBase {
 *
 *     {@literal @}Override
 *     protected void initialize(JobContext context) throws IOException {
 *       // We are responsible for the lifecycle of this connection until we hand it over in
 *       // initializeTable.
 *       Connection connection = ConnectionFactory.createConnection(HBaseConfiguration.create(
 *              job.getConfiguration()));
 *       TableName tableName = TableName.valueOf("exampleTable");
 *       // mandatory. once passed here, TableInputFormatBase will handle closing the connection.
 *       initializeTable(connection, tableName);
 *       byte[][] inputColumns = new byte [][] { Bytes.toBytes("columnA"),
 *         Bytes.toBytes("columnB") };
 *       // optional, by default we'll get everything for the table.
 *       Scan scan = new Scan();
 *       for (byte[] family : inputColumns) {
 *         scan.addFamily(family);
 *       }
 *       Filter exampleFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator("aa.*"));
 *       scan.setFilter(exampleFilter);
 *       setScan(scan);
 *     }
 *   }
 * </pre>
 *
 *
 * The number of InputSplits (mappers) matches the number of regions in a table by default.
 * Set "hbase.mapreduce.tableinput.mappers.per.region" to specify how many mappers per region;
 * setting this property will disable the autobalance described below.
 * Set "hbase.mapreduce.tif.input.autobalance" to enable autobalance; hbase will assign mappers
 * based on the average region size. Regions whose size is larger than the average region size may
 * be assigned more mappers, while smaller ones may be grouped together to share one mapper. If the
 * actual average region size is too big, like 50G, it is not good to only assign 1 mapper for
 * those large regions. Use "hbase.mapreduce.tif.ave.regionsize" to set the max average region size
 * when autobalance is enabled; the default max average region size is 8G.
 */
@InterfaceAudience.Public
public abstract class TableInputFormatBase
    extends InputFormat<ImmutableBytesWritable, Result> {

  private static final Logger LOG = LoggerFactory.getLogger(TableInputFormatBase.class);

  private static final String NOT_INITIALIZED = "The input format instance has not been properly " +
      "initialized. Ensure you call initializeTable either in your constructor or initialize " +
      "method";
  private static final String INITIALIZATION_ERROR = "Cannot create a record reader because of a" +
      " previous error. Please look at the previous logs lines from" +
      " the task's full log for more details.";

  /** Specify if we enable auto-balance to set number of mappers in M/R jobs. */
  public static final String MAPREDUCE_INPUT_AUTOBALANCE = "hbase.mapreduce.tif.input.autobalance";
  /** In auto-balance, we split input by ave region size, if calculated region size is too big, we can set it. */
  public static final String MAX_AVERAGE_REGION_SIZE = "hbase.mapreduce.tif.ave.regionsize";

  /** Set the number of Mappers for each region, all regions have same number of Mappers */
  public static final String NUM_MAPPERS_PER_REGION = "hbase.mapreduce.tableinput.mappers.per.region";


  /** Holds the details for the internal scanner.
   *
   * @see Scan */
  private Scan scan = null;
  /** The {@link Admin}. */
  private Admin admin;
  /** The {@link Table} to scan. */
  private Table table;
  /** The {@link RegionLocator} of the table. */
  private RegionLocator regionLocator;
  /** The reader scanning the table, can be a custom one. */
  private TableRecordReader tableRecordReader = null;
  /** The underlying {@link Connection} of the table. */
  private Connection connection;


  /** The reverse DNS lookup cache mapping: IPAddress => HostName */
  private HashMap<InetAddress, String> reverseDNSCacheMap =
      new HashMap<>();

  /**
   * Builds a {@link TableRecordReader}. If no {@link TableRecordReader} was provided, uses
   * the default.
   *
   * @param split  The split to work with.
   * @param context  The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *   org.apache.hadoop.mapreduce.InputSplit,
   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
  @Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
      InputSplit split, TaskAttemptContext context)
      throws IOException {
    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
      initialize(context);
    }
    // null check in case our child overrides getTable to not throw.
    try {
      if (getTable() == null) {
        // initialize() must not have been implemented in the subclass.
        throw new IOException(INITIALIZATION_ERROR);
      }
    } catch (IllegalStateException exception) {
      throw new IOException(INITIALIZATION_ERROR, exception);
    }
    TableSplit tSplit = (TableSplit) split;
    LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
    final TableRecordReader trr =
        this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
    Scan sc = new Scan(this.scan);
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setTable(getTable());
    return new RecordReader<ImmutableBytesWritable, Result>() {

      @Override
      public void close() throws IOException {
        trr.close();
        closeTable();
      }

      @Override
      public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
        return trr.getCurrentKey();
      }

      @Override
      public Result getCurrentValue() throws IOException, InterruptedException {
        return trr.getCurrentValue();
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        return trr.getProgress();
      }

      @Override
      public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException,
          InterruptedException {
        trr.initialize(inputsplit, context);
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        return trr.nextKeyValue();
      }
    };
  }

  protected Pair<byte[][],byte[][]> getStartEndKeys() throws IOException {
    return getRegionLocator().getStartEndKeys();
  }

  /**
   * Calculates the splits that will serve as input for the map tasks.
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    boolean closeOnFinish = false;

    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
      initialize(context);
      closeOnFinish = true;
    }

    // null check in case our child overrides getTable to not throw.
    try {
      if (getTable() == null) {
        // initialize() must not have been implemented in the subclass.
        throw new IOException(INITIALIZATION_ERROR);
      }
    } catch (IllegalStateException exception) {
      throw new IOException(INITIALIZATION_ERROR, exception);
    }

    try {
      List<InputSplit> splits = oneInputSplitPerRegion();

      // set same number of mappers for each region
      if (context.getConfiguration().get(NUM_MAPPERS_PER_REGION) != null) {
        int nSplitsPerRegion = context.getConfiguration().getInt(NUM_MAPPERS_PER_REGION, 1);
        List<InputSplit> res = new ArrayList<>();
        for (int i = 0; i < splits.size(); i++) {
          List<InputSplit> tmp = createNInputSplitsUniform(splits.get(i), nSplitsPerRegion);
          res.addAll(tmp);
        }
        return res;
      }

      // The default value of "hbase.mapreduce.tif.input.autobalance" is false.
      if (context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false)) {
        long maxAveRegionSize = context.getConfiguration()
            .getLong(MAX_AVERAGE_REGION_SIZE, 8L*1073741824); //8GB
        return calculateAutoBalancedSplits(splits, maxAveRegionSize);
      }

      // return one mapper per region
      return splits;
    } finally {
      if (closeOnFinish) {
        closeTable();
      }
    }
  }

  /**
   * Create one InputSplit per region
   *
   * @return The list of InputSplit for all the regions
   * @throws IOException throws IOException
   */
  private List<InputSplit> oneInputSplitPerRegion() throws IOException {
    RegionSizeCalculator sizeCalculator =
        createRegionSizeCalculator(getRegionLocator(), getAdmin());

    TableName tableName = getTable().getName();

    Pair<byte[][], byte[][]> keys = getStartEndKeys();
    if (keys == null || keys.getFirst() == null ||
        keys.getFirst().length == 0) {
      HRegionLocation regLoc =
          getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
      if (null == regLoc) {
        throw new IOException("Expecting at least one region.");
      }
      List<InputSplit> splits = new ArrayList<>(1);
      long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
      // In the table input format for single table we do not need to
      // store the scan object in table split because it can be memory intensive and redundant
      // information to what is already stored in conf SCAN. See HBASE-25212
      TableSplit split = new TableSplit(tableName, null,
          HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
          .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
      splits.add(split);
      return splits;
    }
    List<InputSplit> splits = new ArrayList<>(keys.getFirst().length);
    for (int i = 0; i < keys.getFirst().length; i++) {
      if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
        continue;
      }

      byte[] startRow = scan.getStartRow();
      byte[] stopRow = scan.getStopRow();
      // determine if the given start an stop key fall into the region
      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
          Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
          (stopRow.length == 0 ||
              Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
        byte[] splitStart = startRow.length == 0 ||
            Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
            keys.getFirst()[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 ||
            Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
            keys.getSecond()[i].length > 0 ?
            keys.getSecond()[i] : stopRow;

        HRegionLocation location = getRegionLocator().getRegionLocation(keys.getFirst()[i], false);
        // The below InetSocketAddress creation does a name resolution.
        InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
        if (isa.isUnresolved()) {
          LOG.warn("Failed resolve " + isa);
        }
        InetAddress regionAddress = isa.getAddress();
        String regionLocation;
        regionLocation = reverseDNS(regionAddress);

        byte[] regionName = location.getRegionInfo().getRegionName();
        String encodedRegionName = location.getRegionInfo().getEncodedName();
        long regionSize = sizeCalculator.getRegionSize(regionName);
        // In the table input format for single table we do not need to
        // store the scan object in table split because it can be memory intensive and redundant
        // information to what is already stored in conf SCAN. See HBASE-25212
        TableSplit split = new TableSplit(tableName, null,
            splitStart, splitStop, regionLocation, encodedRegionName, regionSize);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
          LOG.debug("getSplits: split -> " + i + " -> " + split);
        }
      }
    }
    return splits;
  }

  /**
   * Create n splits for one InputSplit, For now only support uniform distribution
   * @param split A TableSplit corresponding to a range of rowkeys
   * @param n     Number of ranges after splitting.  Pass 1 means no split for the range
   *              Pass 2 if you want to split the range in two;
   * @return A list of TableSplit, the size of the list is n
   * @throws IllegalArgumentIOException throws IllegalArgumentIOException
   */
  protected List<InputSplit> createNInputSplitsUniform(InputSplit split, int n)
      throws IllegalArgumentIOException {
    // instanceof is null-safe, so this also rejects a null split.
    if (!(split instanceof TableSplit)) {
      throw new IllegalArgumentIOException(
          "InputSplit for CreateNSplitsPerRegion can not be null"
              + " and should be instance of TableSplit");
    }
    //if n < 1, then still continue using n = 1
    n = n < 1 ? 1 : n;
    List<InputSplit> res = new ArrayList<>(n);
    if (n == 1) {
      res.add(split);
      return res;
    }

    // Collect Region related information
    TableSplit ts = (TableSplit) split;
    TableName tableName = ts.getTable();
    String regionLocation = ts.getRegionLocation();
    String encodedRegionName = ts.getEncodedRegionName();
    long regionSize = ts.getLength();
    byte[] startRow = ts.getStartRow();
    byte[] endRow = ts.getEndRow();

    // For special case: startRow or endRow is empty
    if (startRow.length == 0 && endRow.length == 0){
      startRow = new byte[1];
      endRow = new byte[1];
      startRow[0] = 0;
      endRow[0] = -1;
    }
    if (startRow.length == 0 && endRow.length != 0){
      startRow = new byte[1];
      startRow[0] = 0;
    }
    if (startRow.length != 0 && endRow.length == 0){
      endRow =new byte[startRow.length];
      for (int k = 0; k < startRow.length; k++){
        endRow[k] = -1;
      }
    }

    // Split Region into n chunks evenly
    byte[][] splitKeys = Bytes.split(startRow, endRow, true, n-1);
    for (int i = 0; i < splitKeys.length - 1; i++) {
      // In the table input format for single table we do not need to
      // store the scan object in table split because it can be memory intensive and redundant
      // information to what is already stored in conf SCAN. See HBASE-25212
      //notice that the regionSize parameter may be not very accurate
      TableSplit tsplit =
          new TableSplit(tableName, null, splitKeys[i], splitKeys[i + 1], regionLocation,
              encodedRegionName, regionSize / n);
      res.add(tsplit);
    }
    return res;
  }
  /**
   * Calculates the number of MapReduce input splits for the map tasks. The number of
   * MapReduce input splits depends on the average region size.
   * Make it 'public' for testing
   *
   * @param splits The list of input splits before balance.
   * @param maxAverageRegionSize max Average region size for one mapper
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  public List<InputSplit> calculateAutoBalancedSplits(List<InputSplit> splits, long maxAverageRegionSize)
      throws IOException {
    if (splits.size() == 0) {
      return splits;
    }
    List<InputSplit> resultList = new ArrayList<>();
    long totalRegionSize = 0;
    for (int i = 0; i < splits.size(); i++) {
      TableSplit ts = (TableSplit) splits.get(i);
      totalRegionSize += ts.getLength();
    }
    long averageRegionSize = totalRegionSize / splits.size();
    // totalRegionSize might be overflow, and the averageRegionSize must be positive.
    if (averageRegionSize <= 0) {
      LOG.warn("The averageRegionSize is not positive: " + averageRegionSize + ", " +
          "set it to Long.MAX_VALUE " + splits.size());
      averageRegionSize = Long.MAX_VALUE / splits.size();
    }
    //if averageRegionSize is too big, change it to default as 1 GB,
    if (averageRegionSize > maxAverageRegionSize) {
      averageRegionSize = maxAverageRegionSize;
    }
    // if averageRegionSize is too small, we do not need to allocate more mappers for those 'large' region
    // set default as 16M = (default hdfs block size) / 4;
    if (averageRegionSize < 16 * 1048576) {
      return splits;
    }
    for (int i = 0; i < splits.size(); i++) {
      TableSplit ts = (TableSplit) splits.get(i);
      TableName tableName = ts.getTable();
      String regionLocation = ts.getRegionLocation();
      String encodedRegionName = ts.getEncodedRegionName();
      long regionSize = ts.getLength();

      if (regionSize >= averageRegionSize) {
        // make this region as multiple MapReduce input split.
        int n = (int) Math.round(Math.log(((double) regionSize) / ((double) averageRegionSize)) + 1.0);
        List<InputSplit> temp = createNInputSplitsUniform(ts, n);
        resultList.addAll(temp);
      } else {
        // if the total size of several small continuous regions less than the average region size,
        // combine them into one MapReduce input split.
        long totalSize = regionSize;
        byte[] splitStartKey = ts.getStartRow();
        byte[] splitEndKey = ts.getEndRow();
        int j = i + 1;
        while (j < splits.size()) {
          TableSplit nextRegion = (TableSplit) splits.get(j);
          long nextRegionSize = nextRegion.getLength();
          if (totalSize + nextRegionSize <= averageRegionSize
              && Bytes.equals(splitEndKey, nextRegion.getStartRow())) {
            totalSize = totalSize + nextRegionSize;
            splitEndKey = nextRegion.getEndRow();
            j++;
          } else {
            break;
          }
        }
        i = j - 1;
        // In the table input format for single table we do not need to
        // store the scan object in table split because it can be memory intensive and redundant
        // information to what is already stored in conf SCAN. See HBASE-25212
        TableSplit t = new TableSplit(tableName, null, splitStartKey, splitEndKey, regionLocation,
            encodedRegionName, totalSize);
        resultList.add(t);
      }
    }
    return resultList;
  }

  String reverseDNS(InetAddress ipAddress) throws UnknownHostException {
    String hostName = this.reverseDNSCacheMap.get(ipAddress);
    if (hostName == null) {
      String ipAddressString = null;
      try {
        ipAddressString = DNS.reverseDns(ipAddress, null);
      } catch (Exception e) {
        // We can use InetAddress in case the jndi failed to pull up the reverse DNS entry from the
        // name service. Also, in case of ipv6, we need to use the InetAddress since resolving
        // reverse DNS using jndi doesn't work well with ipv6 addresses.
        ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
      }
      if (ipAddressString == null) {
        throw new UnknownHostException("No host found for " + ipAddress);
      }
      hostName = Strings.domainNamePointerToHostName(ipAddressString);
      this.reverseDNSCacheMap.put(ipAddress, hostName);
    }
    return hostName;
  }

  /**
   * Test if the given region is to be included in the InputSplit while splitting
   * the regions of a table.
   * <p>
   * This optimization is effective when there is a specific reasoning to exclude an entire region from the M-R job,
   * (and hence, not contributing to the InputSplit), given the start and end keys of the same. <br>
   * Useful when we need to remember the last-processed top record and revisit the [last, current) interval for M-R processing,
   * continuously. In addition to reducing InputSplits, reduces the load on the region server as well, due to the ordering of the keys.
   * <br>
   * <br>
   * Note: It is possible that <code>endKey.length() == 0 </code> , for the last (recent) region.
   * <br>
   * Override this method, if you want to bulk exclude regions altogether from M-R. By default, no region is excluded( i.e. all regions are included).
   *
   *
   * @param startKey Start key of the region
   * @param endKey End key of the region
   * @return true, if this region needs to be included as part of the input (default).
   *
   */
  protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
    return true;
  }

  /**
   * Allows subclasses to get the {@link RegionLocator}.
   */
  protected RegionLocator getRegionLocator() {
    if (regionLocator == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return regionLocator;
  }

  /**
   * Allows subclasses to get the {@link Table}.
   */
  protected Table getTable() {
    if (table == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return table;
  }

  /**
   * Allows subclasses to get the {@link Admin}.
   */
  protected Admin getAdmin() {
    if (admin == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return admin;
  }

  /**
   * Allows subclasses to initialize the table information.
   *
   * @param connection  The Connection to the HBase cluster. MUST be unmanaged. We will close.
   * @param tableName  The {@link TableName} of the table to process.
   * @throws IOException
   */
  protected void initializeTable(Connection connection, TableName tableName) throws IOException {
    if (this.table != null || this.connection != null) {
      LOG.warn("initializeTable called multiple times. Overwriting connection and table " +
          "reference; TableInputFormatBase will not close these old references when done.");
    }
    this.table = connection.getTable(tableName);
    this.regionLocator = connection.getRegionLocator(tableName);
    this.admin = connection.getAdmin();
    this.connection = connection;
  }

  @VisibleForTesting
  protected RegionSizeCalculator createRegionSizeCalculator(RegionLocator locator, Admin admin)
      throws IOException {
    return new RegionSizeCalculator(locator, admin);
  }

  /**
   * Gets the scan defining the actual details like columns etc.
   *
   * @return The internal scan instance.
   */
  public Scan getScan() {
    if (this.scan == null) this.scan = new Scan();
    return scan;
  }

  /**
   * Sets the scan defining the actual details like columns etc.
   *
   * @param scan  The scan to set.
   */
  public void setScan(Scan scan) {
    this.scan = scan;
  }

  /**
   * Allows subclasses to set the {@link TableRecordReader}.
   *
   * @param tableRecordReader A different {@link TableRecordReader}
   *   implementation.
   */
  protected void setTableRecordReader(TableRecordReader tableRecordReader) {
    this.tableRecordReader = tableRecordReader;
  }

  /**
   * Handle subclass specific set up.
   * Each of the entry points used by the MapReduce framework,
   * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
   * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
   * retrieving the necessary configuration information and calling
   * {@link #initializeTable(Connection, TableName)}.
   *
   * Subclasses should implement their initialize call such that it is safe to call multiple times.
   * The current TableInputFormatBase implementation relies on a non-null table reference to decide
   * if an initialize call is needed, but this behavior may change in the future. In particular,
   * it is critical that initializeTable not be called multiple times since this will leak
   * Connection instances.
   *
   */
  protected void initialize(JobContext context) throws IOException {
  }

  /**
   * Close the Table and related objects that were initialized via
   * {@link #initializeTable(Connection, TableName)}.
   *
   * @throws IOException
   */
  protected void closeTable() throws IOException {
    close(admin, table, regionLocator, connection);
    admin = null;
    table = null;
    regionLocator = null;
    connection = null;
  }

  private void close(Closeable... closables) throws IOException {
    for (Closeable c : closables) {
      if(c != null) { c.close(); }
    }
  }

}