View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.mapreduce;
20  
21  import java.io.Closeable;
22  import java.io.IOException;
23  import java.net.InetAddress;
24  import java.net.InetSocketAddress;
25  import java.net.UnknownHostException;
26  import java.util.ArrayList;
27  import java.util.HashMap;
28  import java.util.List;
29  
30  import javax.naming.NamingException;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.hbase.classification.InterfaceAudience;
35  import org.apache.hadoop.hbase.classification.InterfaceStability;
36  import org.apache.hadoop.hbase.HConstants;
37  import org.apache.hadoop.hbase.HRegionLocation;
38  import org.apache.hadoop.hbase.TableName;
39  import org.apache.hadoop.hbase.client.Admin;
40  import org.apache.hadoop.hbase.client.Connection;
41  import org.apache.hadoop.hbase.client.ConnectionFactory;
42  import org.apache.hadoop.hbase.client.HTable;
43  import org.apache.hadoop.hbase.client.NeedUnmanagedConnectionException;
44  import org.apache.hadoop.hbase.client.RegionLocator;
45  import org.apache.hadoop.hbase.client.Result;
46  import org.apache.hadoop.hbase.client.Scan;
47  import org.apache.hadoop.hbase.client.Table;
48  import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException;
49  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
50  import org.apache.hadoop.hbase.util.Addressing;
51  import org.apache.hadoop.hbase.util.Bytes;
52  import org.apache.hadoop.hbase.util.Pair;
53  import org.apache.hadoop.hbase.util.RegionSizeCalculator;
54  import org.apache.hadoop.hbase.util.Strings;
55  import org.apache.hadoop.mapreduce.InputFormat;
56  import org.apache.hadoop.mapreduce.InputSplit;
57  import org.apache.hadoop.mapreduce.JobContext;
58  import org.apache.hadoop.mapreduce.RecordReader;
59  import org.apache.hadoop.mapreduce.TaskAttemptContext;
60  import org.apache.hadoop.net.DNS;
61  import org.apache.hadoop.util.StringUtils;
62  
63  /**
64   * A base for {@link TableInputFormat}s. Receives a {@link Connection}, a {@link TableName},
65   * an {@link Scan} instance that defines the input columns etc. Subclasses may use
66   * other TableRecordReader implementations.
67   *
68   * Subclasses MUST ensure initializeTable(Connection, TableName) is called for an instance to
69   * function properly. Each of the entry points to this class used by the MapReduce framework,
70   * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
71   * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
72   * retrieving the necessary configuration information. If your subclass overrides either of these
73   * methods, either call the parent version or call initialize yourself.
74   *
75   * <p>
76   * An example of a subclass:
77   * <pre>
78   *   class ExampleTIF extends TableInputFormatBase {
79   *
80   *     {@literal @}Override
81   *     protected void initialize(JobContext context) throws IOException {
82   *       // We are responsible for the lifecycle of this connection until we hand it over in
83   *       // initializeTable.
84   *       Connection connection = ConnectionFactory.createConnection(HBaseConfiguration.create(
85   *              job.getConfiguration()));
86   *       TableName tableName = TableName.valueOf("exampleTable");
87   *       // mandatory. once passed here, TableInputFormatBase will handle closing the connection.
88   *       initializeTable(connection, tableName);
89   *       byte[][] inputColumns = new byte [][] { Bytes.toBytes("columnA"),
90   *         Bytes.toBytes("columnB") };
91   *       // optional, by default we'll get everything for the table.
92   *       Scan scan = new Scan();
93   *       for (byte[] family : inputColumns) {
94   *         scan.addFamily(family);
95   *       }
96   *       Filter exampleFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator("aa.*"));
97   *       scan.setFilter(exampleFilter);
98   *       setScan(scan);
99   *     }
100  *   }
101  *
102  * The number of InputSplits (mappers) matches the number of regions in a table by default.
103  * Set "hbase.mapreduce.input.mappers.per.region" to specify how many mappers to use per region;
104  * setting this property disables the autobalance behavior described below.
105  *
106  * Set "hbase.mapreduce.input.autobalance" to enable autobalance; HBase will assign mappers based
107  * on the average region size. Regions larger than the average region size may be assigned more
108  * mappers, while contiguous small regions may be grouped together to share one mapper. If the
109  * actual calculated average region size is too big, it is not good to assign only one mapper to
110  * those large regions. In that case, use "hbase.mapreduce.input.average.regionsize" to cap the
111  * average region size used when "autobalance" is enabled; the default cap is 8G.
112  * </pre>
113  *
114  */
115 @InterfaceAudience.Public
116 @InterfaceStability.Stable
117 public abstract class TableInputFormatBase
118 extends InputFormat<ImmutableBytesWritable, Result> {
119 
120   private static final Log LOG = LogFactory.getLog(TableInputFormatBase.class);
121 
122   private static final String NOT_INITIALIZED = "The input format instance has not been properly " +
123       "initialized. Ensure you call initializeTable either in your constructor or initialize " +
124       "method";
125   private static final String INITIALIZATION_ERROR = "Cannot create a record reader because of a" +
126             " previous error. Please look at the previous logs lines from" +
127             " the task's full log for more details.";
128 
129   @Deprecated
130   /** Deprecated. No effect. */
131   public static final String INPUT_AUTOBALANCE_MAXSKEWRATIO = "hbase.mapreduce.input.autobalance" +
132       ".maxskewratio";
133   @Deprecated
134   /** Deprecated. No effect. */
135   public static final String TABLE_ROW_TEXTKEY = "hbase.table.row.textkey";
136 
137   /** Specify if we enable auto-balance to set number of mappers in M/R jobs. */
138   public static final String MAPREDUCE_INPUT_AUTOBALANCE = "hbase.mapreduce.input.autobalance";
139   /** In auto-balance, we split input by ave region size, if calculated region size is too big, we can set it. */
140   public static final String MAX_AVERAGE_REGION_SIZE = "hbase.mapreduce.input.average.regionsize";
141 
142   /** Set the number of Mappers for each region, all regions have same number of Mappers */
143   public static final String NUM_MAPPERS_PER_REGION = "hbase.mapreduce.input.mappers.per.region";
144 
145   /** Holds the details for the internal scanner.
146    *
147    * @see Scan */
148   private Scan scan = null;
149   /** The {@link Admin}. */
150   private Admin admin;
151   /** The {@link Table} to scan. */
152   private Table table;
153   /** The {@link RegionLocator} of the table. */
154   private RegionLocator regionLocator;
155   /** The reader scanning the table, can be a custom one. */
156   private TableRecordReader tableRecordReader = null;
157   /** The underlying {@link Connection} of the table. */
158   private Connection connection;
159 
160 
161   /** The reverse DNS lookup cache mapping: IPAddress => HostName */
162   private HashMap<InetAddress, String> reverseDNSCacheMap =
163     new HashMap<InetAddress, String>();
164 
  /**
   * Builds a {@link TableRecordReader}. If no {@link TableRecordReader} was provided, uses
   * the default.
   *
   * <p>The returned reader wraps the underlying {@link TableRecordReader} and additionally
   * releases this input format's table resources (via {@link #closeTable()}) when the
   * reader itself is closed.
   *
   * @param split  The split to work with.
   * @param context  The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *   org.apache.hadoop.mapreduce.InputSplit,
   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
  @Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
      InputSplit split, TaskAttemptContext context)
  throws IOException {
    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
      initialize(context);
    }
    // null check in case our child overrides getTable to not throw.
    try {
      if (getTable() == null) {
        // initialize() must not have been implemented in the subclass.
        throw new IOException(INITIALIZATION_ERROR);
      }
    } catch (IllegalStateException exception) {
      throw new IOException(INITIALIZATION_ERROR, exception);
    }
    TableSplit tSplit = (TableSplit) split;
    LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
    final TableRecordReader trr =
        this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
    // Clone the configured scan and narrow it to this split's row range.
    Scan sc = new Scan(this.scan);
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setTable(getTable());
    // Pure delegation to trr, except close() which also tears down the table/connection.
    return new RecordReader<ImmutableBytesWritable, Result>() {

      @Override
      public void close() throws IOException {
        trr.close();
        closeTable();
      }

      @Override
      public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
        return trr.getCurrentKey();
      }

      @Override
      public Result getCurrentValue() throws IOException, InterruptedException {
        return trr.getCurrentValue();
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        return trr.getProgress();
      }

      @Override
      public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException,
          InterruptedException {
        trr.initialize(inputsplit, context);
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        return trr.nextKeyValue();
      }
    };
  }
238 
239   protected Pair<byte[][],byte[][]> getStartEndKeys() throws IOException {
240     return getRegionLocator().getStartEndKeys();
241   }
242 
  /**
   * Calculates the splits that will serve as input for the map tasks. The
   * number of splits matches the number of regions in a table by default;
   * the mappers-per-region and auto-balance settings can change that.
   *
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    // Track whether this call created the table, so we only close what we opened.
    boolean closeOnFinish = false;

    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
      initialize(context);
      closeOnFinish = true;
    }

    // null check in case our child overrides getTable to not throw.
    try {
      if (getTable() == null) {
        // initialize() must not have been implemented in the subclass.
        throw new IOException(INITIALIZATION_ERROR);
      }
    } catch (IllegalStateException exception) {
      throw new IOException(INITIALIZATION_ERROR, exception);
    }
    try {
      List<InputSplit> splits = oneInputSplitPerRegion();

      // set same number of mappers for each region
      if (context.getConfiguration().get(NUM_MAPPERS_PER_REGION) != null) {
        int nSplitsPerRegion = context.getConfiguration().getInt(NUM_MAPPERS_PER_REGION, 1);
        List<InputSplit> res = new ArrayList<>();
        for (int i = 0; i < splits.size(); i++) {
          List<InputSplit> tmp = createNInputSplitsUniform(splits.get(i), nSplitsPerRegion);
          res.addAll(tmp);
        }
        return res;
      }

      //The default value of "hbase.mapreduce.input.autobalance" is false.
      if (context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false)) {
        long maxAveRegionSize = context.getConfiguration()
            .getLong(MAX_AVERAGE_REGION_SIZE, 8L*1073741824); //8GB
        return calculateAutoBalancedSplits(splits, maxAveRegionSize);
      }

      // return one mapper per region
      return splits;
    } catch (NamingException e) {
      // Reverse-DNS lookups can raise NamingException; surface it as IOException for MapReduce.
      throw new IOException(e);
    } finally {
      if (closeOnFinish) {
        closeTable();
      }
    }
  }
303 
  /**
   * Create one InputSplit per region.
   *
   * @return The list of InputSplit for all the regions
   * @throws IOException when region locations or sizes cannot be fetched
   * @throws NamingException when the reverse DNS lookup of a region server fails
   */
  private List<InputSplit> oneInputSplitPerRegion() throws IOException, NamingException {
    RegionSizeCalculator sizeCalculator =
        new RegionSizeCalculator(getRegionLocator(), getAdmin());

    TableName tableName = getTable().getName();

    Pair<byte[][], byte[][]> keys = getStartEndKeys();
    if (keys == null || keys.getFirst() == null ||
        keys.getFirst().length == 0) {
      // No start keys reported: treat the table as a single region spanning everything.
      HRegionLocation regLoc =
          getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
      if (null == regLoc) {
        throw new IOException("Expecting at least one region.");
      }
      List<InputSplit> splits = new ArrayList<>(1);
      long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
      // In the table input format for single table we do not need to
      // store the scan object in table split because it can be memory intensive and redundant
      // information to what is already stored in conf SCAN. See HBASE-25212
      TableSplit split = new TableSplit(tableName, null,
          HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
          .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
      splits.add(split);
      return splits;
    }
    List<InputSplit> splits = new ArrayList<>(keys.getFirst().length);
    for (int i = 0; i < keys.getFirst().length; i++) {
      if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
        continue;
      }

      byte[] startRow = scan.getStartRow();
      byte[] stopRow = scan.getStopRow();
      // determine if the given start and stop keys fall into the region
      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
          Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
          (stopRow.length == 0 ||
              Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
        // Clip the split's row range to the intersection of the scan range and the region range.
        byte[] splitStart = startRow.length == 0 ||
            Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
            keys.getFirst()[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 ||
            Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
            keys.getSecond()[i].length > 0 ?
            keys.getSecond()[i] : stopRow;

        HRegionLocation location = getRegionLocator().getRegionLocation(keys.getFirst()[i], false);
        // The below InetSocketAddress creation does a name resolution.
        InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
        if (isa.isUnresolved()) {
          LOG.warn("Failed resolve " + isa);
        }
        InetAddress regionAddress = isa.getAddress();
        String regionLocation;
        regionLocation = reverseDNS(regionAddress);

        byte[] regionName = location.getRegionInfo().getRegionName();
        String encodedRegionName = location.getRegionInfo().getEncodedName();
        long regionSize = sizeCalculator.getRegionSize(regionName);
        // In the table input format for single table we do not need to
        // store the scan object in table split because it can be memory intensive and redundant
        // information to what is already stored in conf SCAN. See HBASE-25212
        TableSplit split = new TableSplit(tableName, null, splitStart, splitStop,
          regionLocation, encodedRegionName, regionSize);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
          LOG.debug("getSplits: split -> " + i + " -> " + split);
        }
      }
    }
    return splits;
  }
382 
383   /**
384    * Create n splits for one InputSplit, For now only support uniform distribution
385    * @param split A TableSplit corresponding to a range of rowkeys
386    * @param n     Number of ranges after splitting.  Pass 1 means no split for the range
387    *              Pass 2 if you want to split the range in two;
388    * @return A list of TableSplit, the size of the list is n
389    * @throws IllegalArgumentIOException
390    */
391   protected List<InputSplit> createNInputSplitsUniform(InputSplit split, int n)
392       throws IllegalArgumentIOException {
393     if (split == null || !(split instanceof TableSplit)) {
394       throw new IllegalArgumentIOException(
395           "InputSplit for CreateNSplitsPerRegion can not be null + "
396               + "and should be instance of TableSplit");
397     }
398     //if n < 1, then still continue using n = 1
399     n = n < 1 ? 1 : n;
400     List<InputSplit> res = new ArrayList<>(n);
401     if (n == 1) {
402       res.add(split);
403       return res;
404     }
405 
406     // Collect Region related information
407     TableSplit ts = (TableSplit) split;
408     TableName tableName = ts.getTable();
409     String regionLocation = ts.getRegionLocation();
410     String encodedRegionName = ts.getEncodedRegionName();
411     long regionSize = ts.getLength();
412     byte[] startRow = ts.getStartRow();
413     byte[] endRow = ts.getEndRow();
414 
415     // For special case: startRow or endRow is empty
416     if (startRow.length == 0 && endRow.length == 0){
417       startRow = new byte[1];
418       endRow = new byte[1];
419       startRow[0] = 0;
420       endRow[0] = -1;
421     }
422     if (startRow.length == 0 && endRow.length != 0){
423       startRow = new byte[1];
424       startRow[0] = 0;
425     }
426     if (startRow.length != 0 && endRow.length == 0){
427       endRow =new byte[startRow.length];
428       for (int k = 0; k < startRow.length; k++){
429         endRow[k] = -1;
430       }
431     }
432 
433     // Split Region into n chunks evenly
434     byte[][] splitKeys = Bytes.split(startRow, endRow, true, n-1);
435     for (int i = 0; i < splitKeys.length - 1; i++) {
436       // In the table input format for single table we do not need to
437       // store the scan object in table split because it can be memory intensive and redundant
438       // information to what is already stored in conf SCAN. See HBASE-25212
439       //notice that the regionSize parameter may be not very accurate
440       TableSplit tsplit =
441           new TableSplit(tableName, null, splitKeys[i], splitKeys[i + 1], regionLocation,
442               encodedRegionName, regionSize / n);
443       res.add(tsplit);
444     }
445     return res;
446   }
447 
  /**
   * Calculates the number of MapReduce input splits for the map tasks. The number of
   * MapReduce input splits depends on the average region size.
   * Made 'public' for testing.
   * <p>
   * Deprecated. Former functionality has been replaced by calculateAutoBalancedSplits and
   * will function differently. Do not use.
   * <p>
   * @param list  The list of input splits before balance.
   * @param context The current job context; unused, retained for signature compatibility.
   * @param average The average size of all regions.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @deprecated Use {@link #calculateAutoBalancedSplits(List, long)} instead.
   */
  @Deprecated
  public List<InputSplit> calculateRebalancedSplits(List<InputSplit> list, JobContext context,
      long average) throws IOException {
    return calculateAutoBalancedSplits(list, average);
  }
468 
  /**
   * Calculates the number of MapReduce input splits for the map tasks. The number of
   * MapReduce input splits depends on the average region size.
   * Made 'public' for testing.
   *
   * <p>Regions at least as large as the (capped) average are cut into multiple splits;
   * runs of consecutive smaller regions are merged while their combined size stays
   * within the average.
   *
   * @param splits The list of input splits before balance.
   * @param maxAverageRegionSize max Average region size for one mapper
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *org.apache.hadoop.mapreduce.JobContext)
   */
  public List<InputSplit> calculateAutoBalancedSplits(List<InputSplit> splits, long maxAverageRegionSize)
      throws IOException {
    if (splits.size() == 0) {
      return splits;
    }
    List<InputSplit> resultList = new ArrayList<>();
    long totalRegionSize = 0;
    for (int i = 0; i < splits.size(); i++) {
      TableSplit ts = (TableSplit) splits.get(i);
      totalRegionSize += ts.getLength();
    }
    long averageRegionSize = totalRegionSize / splits.size();
    // totalRegionSize might be overflow, and the averageRegionSize must be positive.
    if (averageRegionSize <= 0) {
      LOG.warn("The averageRegionSize is not positive: " + averageRegionSize + ", " +
          "set it to Long.MAX_VALUE " + splits.size());
      averageRegionSize = Long.MAX_VALUE / splits.size();
    }
    //if averageRegionSize is too big, change it to default as 8 GB,
    if (averageRegionSize > maxAverageRegionSize) {
      averageRegionSize = maxAverageRegionSize;
    }
    // if averageRegionSize is too small, we do not need to allocate more mappers for those 'large' region
    // set default as 64M = (default hdfs block size);
    if (averageRegionSize < 64 * 1048576) {
      return splits;
    }
    for (int i = 0; i < splits.size(); i++) {
      TableSplit ts = (TableSplit) splits.get(i);
      TableName tableName = ts.getTable();
      String regionLocation = ts.getRegionLocation();
      String encodedRegionName = ts.getEncodedRegionName();
      long regionSize = ts.getLength();

      if (regionSize >= averageRegionSize) {
        // make this region as multiple MapReduce input split.
        int n = (int) Math.round(Math.log(((double) regionSize) / ((double) averageRegionSize)) + 1.0);
        List<InputSplit> temp = createNInputSplitsUniform(ts, n);
        resultList.addAll(temp);
      } else {
        // if the total size of several small continuous regions less than the average region size,
        // combine them into one MapReduce input split.
        long totalSize = regionSize;
        byte[] splitStartKey = ts.getStartRow();
        byte[] splitEndKey = ts.getEndRow();
        int j = i + 1;
        while (j < splits.size()) {
          TableSplit nextRegion = (TableSplit) splits.get(j);
          long nextRegionSize = nextRegion.getLength();
          if (totalSize + nextRegionSize <= averageRegionSize) {
            totalSize = totalSize + nextRegionSize;
            splitEndKey = nextRegion.getEndRow();
            j++;
          } else {
            break;
          }
        }
        // Skip over the regions merged above; the outer loop's i++ then resumes at j.
        i = j - 1;
        // In the table input format for single table, we do not need to
        // store the scan object in table split because it can be memory intensive and redundant
        // information to what is already stored in conf SCAN. See HBASE-25212
        TableSplit t = new TableSplit(tableName, null, splitStartKey, splitEndKey, regionLocation,
            encodedRegionName, totalSize);
        resultList.add(t);
      }
    }
    return resultList;
  }
549 
550   /**
551    * Deprecated. Do not use.
552    * @param start Start key of the region
553    * @param end End key of the region
554    * @param isText It determines to use text key mode or binary key mode
555    * @return The split point in the region.
556    */
557   @Deprecated
558   public static byte[] getSplitKey(byte[] start, byte[] end, boolean isText) {
559     byte upperLimitByte;
560     byte lowerLimitByte;
561     //Use text mode or binary mode.
562     if (isText) {
563       //The range of text char set in ASCII is [32,126], the lower limit is space and the upper
564       // limit is '~'.
565       upperLimitByte = '~';
566       lowerLimitByte = ' ';
567     } else {
568       upperLimitByte = -1;
569       lowerLimitByte = 0;
570     }
571     // For special case
572     // Example 1 : startkey=null, endkey="hhhqqqwww", splitKey="h"
573     // Example 2 (text key mode): startKey="ffffaaa", endKey=null, splitkey="f~~~~~~"
574     if (start.length == 0 && end.length == 0){
575       return new byte[]{(byte) ((lowerLimitByte + upperLimitByte) / 2)};
576     }
577     if (start.length == 0 && end.length != 0){
578       return new byte[]{ end[0] };
579     }
580     if (start.length != 0 && end.length == 0){
581       byte[] result =new byte[start.length];
582       result[0]=start[0];
583       return result;
584     }
585     return Bytes.split(start, end, false, 1)[1];
586   }
587 
  /**
   * Reverse-resolves the given IP address to a host name, caching results in
   * {@code reverseDNSCacheMap}.
   *
   * @param ipAddress address of a region server.
   * @return the cached or freshly resolved host name.
   * @throws NamingException declared for compatibility; JNDI failures actually fall back to
   *   {@link InetAddress} resolution below rather than propagating.
   * @throws UnknownHostException if no host name can be determined at all.
   * @deprecated mistakenly made public in 0.98.7. scope will change to package-private
   */
  @Deprecated
  public String reverseDNS(InetAddress ipAddress) throws NamingException, UnknownHostException {
    String hostName = this.reverseDNSCacheMap.get(ipAddress);
    if (hostName == null) {
      String ipAddressString = null;
      try {
        ipAddressString = DNS.reverseDns(ipAddress, null);
      } catch (Exception e) {
        // We can use InetAddress in case the jndi failed to pull up the reverse DNS entry from the
        // name service. Also, in case of ipv6, we need to use the InetAddress since resolving
        // reverse DNS using jndi doesn't work well with ipv6 addresses.
        ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
      }
      if (ipAddressString == null) throw new UnknownHostException("No host found for " + ipAddress);
      hostName = Strings.domainNamePointerToHostName(ipAddressString);
      this.reverseDNSCacheMap.put(ipAddress, hostName);
    }
    return hostName;
  }
610 
611   /**
612    *
613    *
614    * Test if the given region is to be included in the InputSplit while splitting
615    * the regions of a table.
616    * <p>
617    * This optimization is effective when there is a specific reasoning to exclude an entire region from the M-R job,
618    * (and hence, not contributing to the InputSplit), given the start and end keys of the same. <br>
619    * Useful when we need to remember the last-processed top record and revisit the [last, current) interval for M-R processing,
620    * continuously. In addition to reducing InputSplits, reduces the load on the region server as
621    * well, due to the ordering of the keys.
622    * <br>
623    * <br>
624    * Note: It is possible that <code>endKey.length() == 0 </code> , for the last (recent) region.
625    * <br>
626    * Override this method, if you want to bulk exclude regions altogether from M-R.
627    * By default, no region is excluded( i.e. all regions are included).
628    *
629    *
630    * @param startKey Start key of the region
631    * @param endKey End key of the region
632    * @return true, if this region needs to be included as part of the input (default).
633    *
634    */
635   protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
636     return true;
637   }
638 
639   /**
640    * Allows subclasses to get the {@link HTable}.
641    *
642    * @deprecated use {@link #getTable()}
643    */
644   @Deprecated
645   protected HTable getHTable() {
646     return (HTable) this.getTable();
647   }
648 
649   /**
650    * Allows subclasses to get the {@link RegionLocator}.
651    */
652   protected RegionLocator getRegionLocator() {
653     if (regionLocator == null) {
654       throw new IllegalStateException(NOT_INITIALIZED);
655     }
656     return regionLocator;
657   }
658 
659   /**
660    * Allows subclasses to get the {@link Table}.
661    */
662   protected Table getTable() {
663     if (table == null) {
664       throw new IllegalStateException(NOT_INITIALIZED);
665     }
666     return table;
667   }
668 
669   /**
670    * Allows subclasses to get the {@link Admin}.
671    */
672   protected Admin getAdmin() {
673     if (admin == null) {
674       throw new IllegalStateException(NOT_INITIALIZED);
675     }
676     return admin;
677   }
678 
679   /**
680    * Allows subclasses to set the {@link HTable}.
681    *
682    * Will attempt to reuse the underlying Connection for our own needs, including
683    * retreiving an Admin interface to the HBase cluster.
684    *
685    * @param table  The table to get the data from.
686    * @throws IOException
687    * @deprecated Use {@link #initializeTable(Connection, TableName)} instead.
688    */
689   @Deprecated
690   protected void setHTable(HTable table) throws IOException {
691     this.table = table;
692     this.connection = table.getConnection();
693     try {
694       this.regionLocator = table.getRegionLocator();
695       this.admin = this.connection.getAdmin();
696     } catch (NeedUnmanagedConnectionException exception) {
697       LOG.warn("You are using an HTable instance that relies on an HBase-managed Connection. " +
698           "This is usually due to directly creating an HTable, which is deprecated. Instead, you " +
699           "should create a Connection object and then request a Table instance from it. If you " +
700           "don't need the Table instance for your own use, you should instead use the " +
701           "TableInputFormatBase.initalizeTable method directly.");
702       LOG.info("Creating an additional unmanaged connection because user provided one can't be " +
703           "used for administrative actions. We'll close it when we close out the table.");
704       LOG.debug("Details about our failure to request an administrative interface.", exception);
705       // Do we need a "copy the settings from this Connection" method? are things like the User
706       // properly maintained by just looking again at the Configuration?
707       this.connection = ConnectionFactory.createConnection(this.connection.getConfiguration());
708       this.regionLocator = this.connection.getRegionLocator(table.getName());
709       this.admin = this.connection.getAdmin();
710     }
711   }
712 
713   /**
714    * Allows subclasses to initialize the table information.
715    *
716    * @param connection  The {@link Connection} to the HBase cluster. MUST be unmanaged. We will close.
717    * @param tableName  The {@link TableName} of the table to process.
718    * @throws IOException
719    */
720   protected void initializeTable(Connection connection, TableName tableName) throws IOException {
721     if (this.table != null || this.connection != null) {
722       LOG.warn("initializeTable called multiple times. Overwriting connection and table " +
723           "reference; TableInputFormatBase will not close these old references when done.");
724     }
725     this.table = connection.getTable(tableName);
726     this.regionLocator = connection.getRegionLocator(tableName);
727     this.admin = connection.getAdmin();
728     this.connection = connection;
729   }
730 
731   /**
732    * Gets the scan defining the actual details like columns etc.
733    *
734    * @return The internal scan instance.
735    */
736   public Scan getScan() {
737     if (this.scan == null) this.scan = new Scan();
738     return scan;
739   }
740 
741   /**
742    * Sets the scan defining the actual details like columns etc.
743    *
744    * @param scan  The scan to set.
745    */
746   public void setScan(Scan scan) {
747     this.scan = scan;
748   }
749 
750   /**
751    * Allows subclasses to set the {@link TableRecordReader}.
752    *
753    * @param tableRecordReader A different {@link TableRecordReader}
754    *   implementation.
755    */
756   protected void setTableRecordReader(TableRecordReader tableRecordReader) {
757     this.tableRecordReader = tableRecordReader;
758   }
759 
  /**
   * Handle subclass specific set up.
   * Each of the entry points used by the MapReduce framework,
   * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
   * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
   * retrieving the necessary configuration information and calling
   * {@link #initializeTable(Connection, TableName)}.
   *
   * Subclasses should implement their initialize call such that it is safe to call multiple times.
   * The current TableInputFormatBase implementation relies on a non-null table reference to decide
   * if an initialize call is needed, but this behavior may change in the future. In particular,
   * it is critical that initializeTable not be called multiple times since this will leak
   * Connection instances.
   *
   * @param context the current job context, from which configuration can be read.
   * @throws IOException if subclass initialization fails.
   */
  protected void initialize(JobContext context) throws IOException {
    // Intentionally a no-op; subclasses override this to call initializeTable.
  }
777 
  /**
   * Close the Table and related objects that were initialized via
   * {@link #initializeTable(Connection, TableName)}.
   *
   * @throws IOException if closing any of the underlying resources fails.
   */
  protected void closeTable() throws IOException {
    // Close dependents before their connection, then null everything so a
    // subsequent initialize starts from a clean slate.
    close(admin, table, regionLocator, connection);
    admin = null;
    table = null;
    regionLocator = null;
    connection = null;
  }
791 
792   private void close(Closeable... closables) throws IOException {
793     for (Closeable c : closables) {
794       if(c != null) { c.close(); }
795     }
796   }
797 
798 }