001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.mapreduce;
020
021import java.io.Closeable;
022import java.io.IOException;
023import java.net.InetAddress;
024import java.net.InetSocketAddress;
025import java.net.UnknownHostException;
026import java.util.ArrayList;
027import java.util.HashMap;
028import java.util.List;
029import org.apache.hadoop.hbase.HConstants;
030import org.apache.hadoop.hbase.HRegionLocation;
031import org.apache.hadoop.hbase.TableName;
032import org.apache.hadoop.hbase.client.Admin;
033import org.apache.hadoop.hbase.client.Connection;
034import org.apache.hadoop.hbase.client.RegionLocator;
035import org.apache.hadoop.hbase.client.Result;
036import org.apache.hadoop.hbase.client.Scan;
037import org.apache.hadoop.hbase.client.Table;
038import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException;
039import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
040import org.apache.hadoop.hbase.util.Addressing;
041import org.apache.hadoop.hbase.util.Bytes;
042import org.apache.hadoop.hbase.util.Pair;
043import org.apache.hadoop.hbase.util.Strings;
044import org.apache.hadoop.mapreduce.InputFormat;
045import org.apache.hadoop.mapreduce.InputSplit;
046import org.apache.hadoop.mapreduce.JobContext;
047import org.apache.hadoop.mapreduce.RecordReader;
048import org.apache.hadoop.mapreduce.TaskAttemptContext;
049import org.apache.hadoop.net.DNS;
050import org.apache.hadoop.util.StringUtils;
051import org.apache.yetus.audience.InterfaceAudience;
052import org.slf4j.Logger;
053import org.slf4j.LoggerFactory;
054import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
055
056/**
057 * A base for {@link TableInputFormat}s. Receives a {@link Connection}, a {@link TableName},
058 * an {@link Scan} instance that defines the input columns etc. Subclasses may use
059 * other TableRecordReader implementations.
060 *
061 * Subclasses MUST ensure initializeTable(Connection, TableName) is called for an instance to
062 * function properly. Each of the entry points to this class used by the MapReduce framework,
063 * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
064 * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
065 * retrieving the necessary configuration information. If your subclass overrides either of these
066 * methods, either call the parent version or call initialize yourself.
067 *
068 * <p>
069 * An example of a subclass:
070 * <pre>
071 *   class ExampleTIF extends TableInputFormatBase {
072 *
073 *     {@literal @}Override
074 *     protected void initialize(JobContext context) throws IOException {
075 *       // We are responsible for the lifecycle of this connection until we hand it over in
076 *       // initializeTable.
 *       Connection connection = ConnectionFactory.createConnection(HBaseConfiguration.create(
 *              context.getConfiguration()));
079 *       TableName tableName = TableName.valueOf("exampleTable");
080 *       // mandatory. once passed here, TableInputFormatBase will handle closing the connection.
081 *       initializeTable(connection, tableName);
082 *       byte[][] inputColumns = new byte [][] { Bytes.toBytes("columnA"),
083 *         Bytes.toBytes("columnB") };
084 *       // optional, by default we'll get everything for the table.
085 *       Scan scan = new Scan();
086 *       for (byte[] family : inputColumns) {
087 *         scan.addFamily(family);
088 *       }
089 *       Filter exampleFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator("aa.*"));
090 *       scan.setFilter(exampleFilter);
091 *       setScan(scan);
092 *     }
093 *   }
094 * </pre>
095 *
096 *
 * The number of InputSplits (mappers) matches the number of regions in a table by default.
 * Set "hbase.mapreduce.tableinput.mappers.per.region" to specify how many mappers to use per
 * region; setting this property disables the autobalance described below.
 * Set "hbase.mapreduce.tif.input.autobalance" to enable autobalance: HBase will assign mappers
 * based on the average region size. Regions larger than the average region size may be assigned
 * more mappers, and smaller regions may be grouped together to share one mapper. If the actual
 * average region size is too big, like 50G, it is not good to assign only 1 mapper to those large
 * regions. Use "hbase.mapreduce.tif.ave.regionsize" to set the max average region size when
 * autobalance is enabled; the default max average region size is 8G.
106 */
107@InterfaceAudience.Public
108public abstract class TableInputFormatBase
109    extends InputFormat<ImmutableBytesWritable, Result> {
110
111  private static final Logger LOG = LoggerFactory.getLogger(TableInputFormatBase.class);
112
113  private static final String NOT_INITIALIZED = "The input format instance has not been properly " +
114      "initialized. Ensure you call initializeTable either in your constructor or initialize " +
115      "method";
116  private static final String INITIALIZATION_ERROR = "Cannot create a record reader because of a" +
117      " previous error. Please look at the previous logs lines from" +
118      " the task's full log for more details.";
119
120  /** Specify if we enable auto-balance to set number of mappers in M/R jobs. */
121  public static final String MAPREDUCE_INPUT_AUTOBALANCE = "hbase.mapreduce.tif.input.autobalance";
122  /** In auto-balance, we split input by ave region size, if calculated region size is too big, we can set it. */
123  public static final String MAX_AVERAGE_REGION_SIZE = "hbase.mapreduce.tif.ave.regionsize";
124
125  /** Set the number of Mappers for each region, all regions have same number of Mappers */
126  public static final String NUM_MAPPERS_PER_REGION = "hbase.mapreduce.tableinput.mappers.per.region";
127
128
129  /** Holds the details for the internal scanner.
130   *
131   * @see Scan */
132  private Scan scan = null;
133  /** The {@link Admin}. */
134  private Admin admin;
135  /** The {@link Table} to scan. */
136  private Table table;
137  /** The {@link RegionLocator} of the table. */
138  private RegionLocator regionLocator;
139  /** The reader scanning the table, can be a custom one. */
140  private TableRecordReader tableRecordReader = null;
141  /** The underlying {@link Connection} of the table. */
142  private Connection connection;
143
144
145  /** The reverse DNS lookup cache mapping: IPAddress => HostName */
146  private HashMap<InetAddress, String> reverseDNSCacheMap =
147      new HashMap<>();
148
149  /**
150   * Builds a {@link TableRecordReader}. If no {@link TableRecordReader} was provided, uses
151   * the default.
152   *
153   * @param split  The split to work with.
154   * @param context  The current context.
155   * @return The newly created record reader.
156   * @throws IOException When creating the reader fails.
157   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
158   *   org.apache.hadoop.mapreduce.InputSplit,
159   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
160   */
161  @Override
162  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
163      InputSplit split, TaskAttemptContext context)
164      throws IOException {
165    // Just in case a subclass is relying on JobConfigurable magic.
166    if (table == null) {
167      initialize(context);
168    }
169    // null check in case our child overrides getTable to not throw.
170    try {
171      if (getTable() == null) {
172        // initialize() must not have been implemented in the subclass.
173        throw new IOException(INITIALIZATION_ERROR);
174      }
175    } catch (IllegalStateException exception) {
176      throw new IOException(INITIALIZATION_ERROR, exception);
177    }
178    TableSplit tSplit = (TableSplit) split;
179    LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
180    final TableRecordReader trr =
181        this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
182    Scan sc = new Scan(this.scan);
183    sc.setStartRow(tSplit.getStartRow());
184    sc.setStopRow(tSplit.getEndRow());
185    trr.setScan(sc);
186    trr.setTable(getTable());
187    return new RecordReader<ImmutableBytesWritable, Result>() {
188
189      @Override
190      public void close() throws IOException {
191        trr.close();
192        closeTable();
193      }
194
195      @Override
196      public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
197        return trr.getCurrentKey();
198      }
199
200      @Override
201      public Result getCurrentValue() throws IOException, InterruptedException {
202        return trr.getCurrentValue();
203      }
204
205      @Override
206      public float getProgress() throws IOException, InterruptedException {
207        return trr.getProgress();
208      }
209
210      @Override
211      public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException,
212          InterruptedException {
213        trr.initialize(inputsplit, context);
214      }
215
216      @Override
217      public boolean nextKeyValue() throws IOException, InterruptedException {
218        return trr.nextKeyValue();
219      }
220    };
221  }
222
223  protected Pair<byte[][],byte[][]> getStartEndKeys() throws IOException {
224    return getRegionLocator().getStartEndKeys();
225  }
226
227  /**
228   * Calculates the splits that will serve as input for the map tasks.
229   * @param context  The current job context.
230   * @return The list of input splits.
231   * @throws IOException When creating the list of splits fails.
232   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
233   *   org.apache.hadoop.mapreduce.JobContext)
234   */
235  @Override
236  public List<InputSplit> getSplits(JobContext context) throws IOException {
237    boolean closeOnFinish = false;
238
239    // Just in case a subclass is relying on JobConfigurable magic.
240    if (table == null) {
241      initialize(context);
242      closeOnFinish = true;
243    }
244
245    // null check in case our child overrides getTable to not throw.
246    try {
247      if (getTable() == null) {
248        // initialize() must not have been implemented in the subclass.
249        throw new IOException(INITIALIZATION_ERROR);
250      }
251    } catch (IllegalStateException exception) {
252      throw new IOException(INITIALIZATION_ERROR, exception);
253    }
254
255    try {
256      List<InputSplit> splits = oneInputSplitPerRegion();
257
258      // set same number of mappers for each region
259      if (context.getConfiguration().get(NUM_MAPPERS_PER_REGION) != null) {
260        int nSplitsPerRegion = context.getConfiguration().getInt(NUM_MAPPERS_PER_REGION, 1);
261        List<InputSplit> res = new ArrayList<>();
262        for (int i = 0; i < splits.size(); i++) {
263          List<InputSplit> tmp = createNInputSplitsUniform(splits.get(i), nSplitsPerRegion);
264          res.addAll(tmp);
265        }
266        return res;
267      }
268
269      //The default value of "hbase.mapreduce.input.autobalance" is false.
270      if (context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false)) {
271        long maxAveRegionSize = context.getConfiguration()
272            .getLong(MAX_AVERAGE_REGION_SIZE, 8L*1073741824); //8GB
273        return calculateAutoBalancedSplits(splits, maxAveRegionSize);
274      }
275
276      // return one mapper per region
277      return splits;
278    } finally {
279      if (closeOnFinish) {
280        closeTable();
281      }
282    }
283  }
284
285  /**
286   * Create one InputSplit per region
287   *
288   * @return The list of InputSplit for all the regions
289   * @throws IOException throws IOException
290   */
291  private List<InputSplit> oneInputSplitPerRegion() throws IOException {
292    RegionSizeCalculator sizeCalculator =
293        createRegionSizeCalculator(getRegionLocator(), getAdmin());
294
295    TableName tableName = getTable().getName();
296
297    Pair<byte[][], byte[][]> keys = getStartEndKeys();
298    if (keys == null || keys.getFirst() == null ||
299        keys.getFirst().length == 0) {
300      HRegionLocation regLoc =
301          getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
302      if (null == regLoc) {
303        throw new IOException("Expecting at least one region.");
304      }
305      List<InputSplit> splits = new ArrayList<>(1);
306      long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
307      // In the table input format for single table we do not need to
308      // store the scan object in table split because it can be memory intensive and redundant
309      // information to what is already stored in conf SCAN. See HBASE-25212
310      TableSplit split = new TableSplit(tableName, null,
311          HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
312          .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
313      splits.add(split);
314      return splits;
315    }
316    List<InputSplit> splits = new ArrayList<>(keys.getFirst().length);
317    for (int i = 0; i < keys.getFirst().length; i++) {
318      if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
319        continue;
320      }
321
322      byte[] startRow = scan.getStartRow();
323      byte[] stopRow = scan.getStopRow();
324      // determine if the given start an stop key fall into the region
325      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
326          Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
327          (stopRow.length == 0 ||
328              Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
329        byte[] splitStart = startRow.length == 0 ||
330            Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
331            keys.getFirst()[i] : startRow;
332        byte[] splitStop = (stopRow.length == 0 ||
333            Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
334            keys.getSecond()[i].length > 0 ?
335            keys.getSecond()[i] : stopRow;
336
337        HRegionLocation location = getRegionLocator().getRegionLocation(keys.getFirst()[i], false);
338        // The below InetSocketAddress creation does a name resolution.
339        InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
340        if (isa.isUnresolved()) {
341          LOG.warn("Failed resolve " + isa);
342        }
343        InetAddress regionAddress = isa.getAddress();
344        String regionLocation;
345        regionLocation = reverseDNS(regionAddress);
346
347        byte[] regionName = location.getRegionInfo().getRegionName();
348        String encodedRegionName = location.getRegionInfo().getEncodedName();
349        long regionSize = sizeCalculator.getRegionSize(regionName);
350        // In the table input format for single table we do not need to
351        // store the scan object in table split because it can be memory intensive and redundant
352        // information to what is already stored in conf SCAN. See HBASE-25212
353        TableSplit split = new TableSplit(tableName, null,
354            splitStart, splitStop, regionLocation, encodedRegionName, regionSize);
355        splits.add(split);
356        if (LOG.isDebugEnabled()) {
357          LOG.debug("getSplits: split -> " + i + " -> " + split);
358        }
359      }
360    }
361    return splits;
362  }
363
364  /**
365   * Create n splits for one InputSplit, For now only support uniform distribution
366   * @param split A TableSplit corresponding to a range of rowkeys
367   * @param n     Number of ranges after splitting.  Pass 1 means no split for the range
368   *              Pass 2 if you want to split the range in two;
369   * @return A list of TableSplit, the size of the list is n
370   * @throws IllegalArgumentIOException throws IllegalArgumentIOException
371   */
372  protected List<InputSplit> createNInputSplitsUniform(InputSplit split, int n)
373      throws IllegalArgumentIOException {
374    if (split == null || !(split instanceof TableSplit)) {
375      throw new IllegalArgumentIOException(
376          "InputSplit for CreateNSplitsPerRegion can not be null + "
377              + "and should be instance of TableSplit");
378    }
379    //if n < 1, then still continue using n = 1
380    n = n < 1 ? 1 : n;
381    List<InputSplit> res = new ArrayList<>(n);
382    if (n == 1) {
383      res.add(split);
384      return res;
385    }
386
387    // Collect Region related information
388    TableSplit ts = (TableSplit) split;
389    TableName tableName = ts.getTable();
390    String regionLocation = ts.getRegionLocation();
391    String encodedRegionName = ts.getEncodedRegionName();
392    long regionSize = ts.getLength();
393    byte[] startRow = ts.getStartRow();
394    byte[] endRow = ts.getEndRow();
395
396    // For special case: startRow or endRow is empty
397    if (startRow.length == 0 && endRow.length == 0){
398      startRow = new byte[1];
399      endRow = new byte[1];
400      startRow[0] = 0;
401      endRow[0] = -1;
402    }
403    if (startRow.length == 0 && endRow.length != 0){
404      startRow = new byte[1];
405      startRow[0] = 0;
406    }
407    if (startRow.length != 0 && endRow.length == 0){
408      endRow =new byte[startRow.length];
409      for (int k = 0; k < startRow.length; k++){
410        endRow[k] = -1;
411      }
412    }
413
414    // Split Region into n chunks evenly
415    byte[][] splitKeys = Bytes.split(startRow, endRow, true, n-1);
416    for (int i = 0; i < splitKeys.length - 1; i++) {
417      // In the table input format for single table we do not need to
418      // store the scan object in table split because it can be memory intensive and redundant
419      // information to what is already stored in conf SCAN. See HBASE-25212
420      //notice that the regionSize parameter may be not very accurate
421      TableSplit tsplit =
422          new TableSplit(tableName, null, splitKeys[i], splitKeys[i + 1], regionLocation,
423              encodedRegionName, regionSize / n);
424      res.add(tsplit);
425    }
426    return res;
427  }
428  /**
429   * Calculates the number of MapReduce input splits for the map tasks. The number of
430   * MapReduce input splits depends on the average region size.
431   * Make it 'public' for testing
432   *
433   * @param splits The list of input splits before balance.
434   * @param maxAverageRegionSize max Average region size for one mapper
435   * @return The list of input splits.
436   * @throws IOException When creating the list of splits fails.
437   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
438   *org.apache.hadoop.mapreduce.JobContext)
439   */
440  public List<InputSplit> calculateAutoBalancedSplits(List<InputSplit> splits, long maxAverageRegionSize)
441      throws IOException {
442    if (splits.size() == 0) {
443      return splits;
444    }
445    List<InputSplit> resultList = new ArrayList<>();
446    long totalRegionSize = 0;
447    for (int i = 0; i < splits.size(); i++) {
448      TableSplit ts = (TableSplit) splits.get(i);
449      totalRegionSize += ts.getLength();
450    }
451    long averageRegionSize = totalRegionSize / splits.size();
452    // totalRegionSize might be overflow, and the averageRegionSize must be positive.
453    if (averageRegionSize <= 0) {
454      LOG.warn("The averageRegionSize is not positive: " + averageRegionSize + ", " +
455          "set it to Long.MAX_VALUE " + splits.size());
456      averageRegionSize = Long.MAX_VALUE / splits.size();
457    }
458    //if averageRegionSize is too big, change it to default as 1 GB,
459    if (averageRegionSize > maxAverageRegionSize) {
460      averageRegionSize = maxAverageRegionSize;
461    }
462    // if averageRegionSize is too small, we do not need to allocate more mappers for those 'large' region
463    // set default as 16M = (default hdfs block size) / 4;
464    if (averageRegionSize < 16 * 1048576) {
465      return splits;
466    }
467    for (int i = 0; i < splits.size(); i++) {
468      TableSplit ts = (TableSplit) splits.get(i);
469      TableName tableName = ts.getTable();
470      String regionLocation = ts.getRegionLocation();
471      String encodedRegionName = ts.getEncodedRegionName();
472      long regionSize = ts.getLength();
473
474      if (regionSize >= averageRegionSize) {
475        // make this region as multiple MapReduce input split.
476        int n = (int) Math.round(Math.log(((double) regionSize) / ((double) averageRegionSize)) + 1.0);
477        List<InputSplit> temp = createNInputSplitsUniform(ts, n);
478        resultList.addAll(temp);
479      } else {
480        // if the total size of several small continuous regions less than the average region size,
481        // combine them into one MapReduce input split.
482        long totalSize = regionSize;
483        byte[] splitStartKey = ts.getStartRow();
484        byte[] splitEndKey = ts.getEndRow();
485        int j = i + 1;
486        while (j < splits.size()) {
487          TableSplit nextRegion = (TableSplit) splits.get(j);
488          long nextRegionSize = nextRegion.getLength();
489          if (totalSize + nextRegionSize <= averageRegionSize
490              && Bytes.equals(splitEndKey, nextRegion.getStartRow())) {
491            totalSize = totalSize + nextRegionSize;
492            splitEndKey = nextRegion.getEndRow();
493            j++;
494          } else {
495            break;
496          }
497        }
498        i = j - 1;
499        // In the table input format for single table we do not need to
500        // store the scan object in table split because it can be memory intensive and redundant
501        // information to what is already stored in conf SCAN. See HBASE-25212
502        TableSplit t = new TableSplit(tableName, null, splitStartKey, splitEndKey, regionLocation,
503            encodedRegionName, totalSize);
504        resultList.add(t);
505      }
506    }
507    return resultList;
508  }
509
510  String reverseDNS(InetAddress ipAddress) throws UnknownHostException {
511    String hostName = this.reverseDNSCacheMap.get(ipAddress);
512    if (hostName == null) {
513      String ipAddressString = null;
514      try {
515        ipAddressString = DNS.reverseDns(ipAddress, null);
516      } catch (Exception e) {
517        // We can use InetAddress in case the jndi failed to pull up the reverse DNS entry from the
518        // name service. Also, in case of ipv6, we need to use the InetAddress since resolving
519        // reverse DNS using jndi doesn't work well with ipv6 addresses.
520        ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
521      }
522      if (ipAddressString == null) {
523        throw new UnknownHostException("No host found for " + ipAddress);
524      }
525      hostName = Strings.domainNamePointerToHostName(ipAddressString);
526      this.reverseDNSCacheMap.put(ipAddress, hostName);
527    }
528    return hostName;
529  }
530
531  /**
532   * Test if the given region is to be included in the InputSplit while splitting
533   * the regions of a table.
534   * <p>
535   * This optimization is effective when there is a specific reasoning to exclude an entire region from the M-R job,
536   * (and hence, not contributing to the InputSplit), given the start and end keys of the same. <br>
537   * Useful when we need to remember the last-processed top record and revisit the [last, current) interval for M-R processing,
538   * continuously. In addition to reducing InputSplits, reduces the load on the region server as well, due to the ordering of the keys.
539   * <br>
540   * <br>
541   * Note: It is possible that <code>endKey.length() == 0 </code> , for the last (recent) region.
542   * <br>
543   * Override this method, if you want to bulk exclude regions altogether from M-R. By default, no region is excluded( i.e. all regions are included).
544   *
545   *
546   * @param startKey Start key of the region
547   * @param endKey End key of the region
548   * @return true, if this region needs to be included as part of the input (default).
549   *
550   */
551  protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
552    return true;
553  }
554
555  /**
556   * Allows subclasses to get the {@link RegionLocator}.
557   */
558  protected RegionLocator getRegionLocator() {
559    if (regionLocator == null) {
560      throw new IllegalStateException(NOT_INITIALIZED);
561    }
562    return regionLocator;
563  }
564
565  /**
566   * Allows subclasses to get the {@link Table}.
567   */
568  protected Table getTable() {
569    if (table == null) {
570      throw new IllegalStateException(NOT_INITIALIZED);
571    }
572    return table;
573  }
574
575  /**
576   * Allows subclasses to get the {@link Admin}.
577   */
578  protected Admin getAdmin() {
579    if (admin == null) {
580      throw new IllegalStateException(NOT_INITIALIZED);
581    }
582    return admin;
583  }
584
585  /**
586   * Allows subclasses to initialize the table information.
587   *
588   * @param connection  The Connection to the HBase cluster. MUST be unmanaged. We will close.
589   * @param tableName  The {@link TableName} of the table to process.
590   * @throws IOException
591   */
592  protected void initializeTable(Connection connection, TableName tableName) throws IOException {
593    if (this.table != null || this.connection != null) {
594      LOG.warn("initializeTable called multiple times. Overwriting connection and table " +
595          "reference; TableInputFormatBase will not close these old references when done.");
596    }
597    this.table = connection.getTable(tableName);
598    this.regionLocator = connection.getRegionLocator(tableName);
599    this.admin = connection.getAdmin();
600    this.connection = connection;
601  }
602
603  @VisibleForTesting
604  protected RegionSizeCalculator createRegionSizeCalculator(RegionLocator locator, Admin admin)
605      throws IOException {
606    return new RegionSizeCalculator(locator, admin);
607  }
608
609  /**
610   * Gets the scan defining the actual details like columns etc.
611   *
612   * @return The internal scan instance.
613   */
614  public Scan getScan() {
615    if (this.scan == null) this.scan = new Scan();
616    return scan;
617  }
618
619  /**
620   * Sets the scan defining the actual details like columns etc.
621   *
622   * @param scan  The scan to set.
623   */
624  public void setScan(Scan scan) {
625    this.scan = scan;
626  }
627
628  /**
629   * Allows subclasses to set the {@link TableRecordReader}.
630   *
631   * @param tableRecordReader A different {@link TableRecordReader}
632   *   implementation.
633   */
634  protected void setTableRecordReader(TableRecordReader tableRecordReader) {
635    this.tableRecordReader = tableRecordReader;
636  }
637
638  /**
639   * Handle subclass specific set up.
640   * Each of the entry points used by the MapReduce framework,
641   * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
642   * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
643   * retrieving the necessary configuration information and calling
644   * {@link #initializeTable(Connection, TableName)}.
645   *
646   * Subclasses should implement their initialize call such that it is safe to call multiple times.
647   * The current TableInputFormatBase implementation relies on a non-null table reference to decide
648   * if an initialize call is needed, but this behavior may change in the future. In particular,
649   * it is critical that initializeTable not be called multiple times since this will leak
650   * Connection instances.
651   *
652   */
653  protected void initialize(JobContext context) throws IOException {
654  }
655
656  /**
657   * Close the Table and related objects that were initialized via
658   * {@link #initializeTable(Connection, TableName)}.
659   *
660   * @throws IOException
661   */
662  protected void closeTable() throws IOException {
663    close(admin, table, regionLocator, connection);
664    admin = null;
665    table = null;
666    regionLocator = null;
667    connection = null;
668  }
669
670  private void close(Closeable... closables) throws IOException {
671    for (Closeable c : closables) {
672      if(c != null) { c.close(); }
673    }
674  }
675
676}