1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.mapreduce;
20
21 import java.io.Closeable;
22 import java.io.IOException;
23 import java.net.InetAddress;
24 import java.net.InetSocketAddress;
25 import java.net.UnknownHostException;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
29
30 import javax.naming.NamingException;
31
32 import org.apache.commons.logging.Log;
33 import org.apache.commons.logging.LogFactory;
34 import org.apache.hadoop.hbase.classification.InterfaceAudience;
35 import org.apache.hadoop.hbase.classification.InterfaceStability;
36 import org.apache.hadoop.hbase.HConstants;
37 import org.apache.hadoop.hbase.HRegionLocation;
38 import org.apache.hadoop.hbase.TableName;
39 import org.apache.hadoop.hbase.client.Admin;
40 import org.apache.hadoop.hbase.client.Connection;
41 import org.apache.hadoop.hbase.client.ConnectionFactory;
42 import org.apache.hadoop.hbase.client.HTable;
43 import org.apache.hadoop.hbase.client.NeedUnmanagedConnectionException;
44 import org.apache.hadoop.hbase.client.RegionLocator;
45 import org.apache.hadoop.hbase.client.Result;
46 import org.apache.hadoop.hbase.client.Scan;
47 import org.apache.hadoop.hbase.client.Table;
48 import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException;
49 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
50 import org.apache.hadoop.hbase.util.Addressing;
51 import org.apache.hadoop.hbase.util.Bytes;
52 import org.apache.hadoop.hbase.util.Pair;
53 import org.apache.hadoop.hbase.util.RegionSizeCalculator;
54 import org.apache.hadoop.hbase.util.Strings;
55 import org.apache.hadoop.mapreduce.InputFormat;
56 import org.apache.hadoop.mapreduce.InputSplit;
57 import org.apache.hadoop.mapreduce.JobContext;
58 import org.apache.hadoop.mapreduce.RecordReader;
59 import org.apache.hadoop.mapreduce.TaskAttemptContext;
60 import org.apache.hadoop.net.DNS;
61 import org.apache.hadoop.util.StringUtils;
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115 @InterfaceAudience.Public
116 @InterfaceStability.Stable
117 public abstract class TableInputFormatBase
118 extends InputFormat<ImmutableBytesWritable, Result> {
119
120 private static final Log LOG = LogFactory.getLog(TableInputFormatBase.class);
121
122 private static final String NOT_INITIALIZED = "The input format instance has not been properly " +
123 "initialized. Ensure you call initializeTable either in your constructor or initialize " +
124 "method";
125 private static final String INITIALIZATION_ERROR = "Cannot create a record reader because of a" +
126 " previous error. Please look at the previous logs lines from" +
127 " the task's full log for more details.";
128
129 @Deprecated
130
131 public static final String INPUT_AUTOBALANCE_MAXSKEWRATIO = "hbase.mapreduce.input.autobalance" +
132 ".maxskewratio";
133 @Deprecated
134
135 public static final String TABLE_ROW_TEXTKEY = "hbase.table.row.textkey";
136
137
138 public static final String MAPREDUCE_INPUT_AUTOBALANCE = "hbase.mapreduce.input.autobalance";
139
140 public static final String MAX_AVERAGE_REGION_SIZE = "hbase.mapreduce.input.average.regionsize";
141
142
143 public static final String NUM_MAPPERS_PER_REGION = "hbase.mapreduce.input.mappers.per.region";
144
145
146
147
148 private Scan scan = null;
149
150 private Admin admin;
151
152 private Table table;
153
154 private RegionLocator regionLocator;
155
156 private TableRecordReader tableRecordReader = null;
157
158 private Connection connection;
159
160
161
162 private HashMap<InetAddress, String> reverseDNSCacheMap =
163 new HashMap<InetAddress, String>();
164
165
166
167
168
169
170
171
172
173
174
175
176
177 @Override
178 public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
179 InputSplit split, TaskAttemptContext context)
180 throws IOException {
181
182 if (table == null) {
183 initialize(context);
184 }
185
186 try {
187 if (getTable() == null) {
188
189 throw new IOException(INITIALIZATION_ERROR);
190 }
191 } catch (IllegalStateException exception) {
192 throw new IOException(INITIALIZATION_ERROR, exception);
193 }
194 TableSplit tSplit = (TableSplit) split;
195 LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
196 final TableRecordReader trr =
197 this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
198 Scan sc = new Scan(this.scan);
199 sc.setStartRow(tSplit.getStartRow());
200 sc.setStopRow(tSplit.getEndRow());
201 trr.setScan(sc);
202 trr.setTable(getTable());
203 return new RecordReader<ImmutableBytesWritable, Result>() {
204
205 @Override
206 public void close() throws IOException {
207 trr.close();
208 closeTable();
209 }
210
211 @Override
212 public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
213 return trr.getCurrentKey();
214 }
215
216 @Override
217 public Result getCurrentValue() throws IOException, InterruptedException {
218 return trr.getCurrentValue();
219 }
220
221 @Override
222 public float getProgress() throws IOException, InterruptedException {
223 return trr.getProgress();
224 }
225
226 @Override
227 public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException,
228 InterruptedException {
229 trr.initialize(inputsplit, context);
230 }
231
232 @Override
233 public boolean nextKeyValue() throws IOException, InterruptedException {
234 return trr.nextKeyValue();
235 }
236 };
237 }
238
239 protected Pair<byte[][],byte[][]> getStartEndKeys() throws IOException {
240 return getRegionLocator().getStartEndKeys();
241 }
242
243
244
245
246
247
248
249
250
251
252
253 @Override
254 public List<InputSplit> getSplits(JobContext context) throws IOException {
255 boolean closeOnFinish = false;
256
257
258 if (table == null) {
259 initialize(context);
260 closeOnFinish = true;
261 }
262
263
264 try {
265 if (getTable() == null) {
266
267 throw new IOException(INITIALIZATION_ERROR);
268 }
269 } catch (IllegalStateException exception) {
270 throw new IOException(INITIALIZATION_ERROR, exception);
271 }
272 try {
273 List<InputSplit> splits = oneInputSplitPerRegion();
274
275
276 if (context.getConfiguration().get(NUM_MAPPERS_PER_REGION) != null) {
277 int nSplitsPerRegion = context.getConfiguration().getInt(NUM_MAPPERS_PER_REGION, 1);
278 List<InputSplit> res = new ArrayList<>();
279 for (int i = 0; i < splits.size(); i++) {
280 List<InputSplit> tmp = createNInputSplitsUniform(splits.get(i), nSplitsPerRegion);
281 res.addAll(tmp);
282 }
283 return res;
284 }
285
286
287 if (context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false)) {
288 long maxAveRegionSize = context.getConfiguration()
289 .getLong(MAX_AVERAGE_REGION_SIZE, 8L*1073741824);
290 return calculateAutoBalancedSplits(splits, maxAveRegionSize);
291 }
292
293
294 return splits;
295 } catch (NamingException e) {
296 throw new IOException(e);
297 } finally {
298 if (closeOnFinish) {
299 closeTable();
300 }
301 }
302 }
303
304
305
306
307
308
309
310 private List<InputSplit> oneInputSplitPerRegion() throws IOException, NamingException {
311 RegionSizeCalculator sizeCalculator =
312 new RegionSizeCalculator(getRegionLocator(), getAdmin());
313
314 TableName tableName = getTable().getName();
315
316 Pair<byte[][], byte[][]> keys = getStartEndKeys();
317 if (keys == null || keys.getFirst() == null ||
318 keys.getFirst().length == 0) {
319 HRegionLocation regLoc =
320 getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
321 if (null == regLoc) {
322 throw new IOException("Expecting at least one region.");
323 }
324 List<InputSplit> splits = new ArrayList<>(1);
325 long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
326
327
328
329 TableSplit split = new TableSplit(tableName, null,
330 HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
331 .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
332 splits.add(split);
333 return splits;
334 }
335 List<InputSplit> splits = new ArrayList<>(keys.getFirst().length);
336 for (int i = 0; i < keys.getFirst().length; i++) {
337 if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
338 continue;
339 }
340
341 byte[] startRow = scan.getStartRow();
342 byte[] stopRow = scan.getStopRow();
343
344 if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
345 Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
346 (stopRow.length == 0 ||
347 Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
348 byte[] splitStart = startRow.length == 0 ||
349 Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
350 keys.getFirst()[i] : startRow;
351 byte[] splitStop = (stopRow.length == 0 ||
352 Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
353 keys.getSecond()[i].length > 0 ?
354 keys.getSecond()[i] : stopRow;
355
356 HRegionLocation location = getRegionLocator().getRegionLocation(keys.getFirst()[i], false);
357
358 InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
359 if (isa.isUnresolved()) {
360 LOG.warn("Failed resolve " + isa);
361 }
362 InetAddress regionAddress = isa.getAddress();
363 String regionLocation;
364 regionLocation = reverseDNS(regionAddress);
365
366 byte[] regionName = location.getRegionInfo().getRegionName();
367 String encodedRegionName = location.getRegionInfo().getEncodedName();
368 long regionSize = sizeCalculator.getRegionSize(regionName);
369
370
371
372 TableSplit split = new TableSplit(tableName, null, splitStart, splitStop,
373 regionLocation, encodedRegionName, regionSize);
374 splits.add(split);
375 if (LOG.isDebugEnabled()) {
376 LOG.debug("getSplits: split -> " + i + " -> " + split);
377 }
378 }
379 }
380 return splits;
381 }
382
383
384
385
386
387
388
389
390
391 protected List<InputSplit> createNInputSplitsUniform(InputSplit split, int n)
392 throws IllegalArgumentIOException {
393 if (split == null || !(split instanceof TableSplit)) {
394 throw new IllegalArgumentIOException(
395 "InputSplit for CreateNSplitsPerRegion can not be null + "
396 + "and should be instance of TableSplit");
397 }
398
399 n = n < 1 ? 1 : n;
400 List<InputSplit> res = new ArrayList<>(n);
401 if (n == 1) {
402 res.add(split);
403 return res;
404 }
405
406
407 TableSplit ts = (TableSplit) split;
408 TableName tableName = ts.getTable();
409 String regionLocation = ts.getRegionLocation();
410 String encodedRegionName = ts.getEncodedRegionName();
411 long regionSize = ts.getLength();
412 byte[] startRow = ts.getStartRow();
413 byte[] endRow = ts.getEndRow();
414
415
416 if (startRow.length == 0 && endRow.length == 0){
417 startRow = new byte[1];
418 endRow = new byte[1];
419 startRow[0] = 0;
420 endRow[0] = -1;
421 }
422 if (startRow.length == 0 && endRow.length != 0){
423 startRow = new byte[1];
424 startRow[0] = 0;
425 }
426 if (startRow.length != 0 && endRow.length == 0){
427 endRow =new byte[startRow.length];
428 for (int k = 0; k < startRow.length; k++){
429 endRow[k] = -1;
430 }
431 }
432
433
434 byte[][] splitKeys = Bytes.split(startRow, endRow, true, n-1);
435 for (int i = 0; i < splitKeys.length - 1; i++) {
436
437
438
439
440 TableSplit tsplit =
441 new TableSplit(tableName, null, splitKeys[i], splitKeys[i + 1], regionLocation,
442 encodedRegionName, regionSize / n);
443 res.add(tsplit);
444 }
445 return res;
446 }
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463 @Deprecated
464 public List<InputSplit> calculateRebalancedSplits(List<InputSplit> list, JobContext context,
465 long average) throws IOException {
466 return calculateAutoBalancedSplits(list, average);
467 }
468
469
470
471
472
473
474
475
476
477
478
479
480
481 public List<InputSplit> calculateAutoBalancedSplits(List<InputSplit> splits, long maxAverageRegionSize)
482 throws IOException {
483 if (splits.size() == 0) {
484 return splits;
485 }
486 List<InputSplit> resultList = new ArrayList<>();
487 long totalRegionSize = 0;
488 for (int i = 0; i < splits.size(); i++) {
489 TableSplit ts = (TableSplit) splits.get(i);
490 totalRegionSize += ts.getLength();
491 }
492 long averageRegionSize = totalRegionSize / splits.size();
493
494 if (averageRegionSize <= 0) {
495 LOG.warn("The averageRegionSize is not positive: " + averageRegionSize + ", " +
496 "set it to Long.MAX_VALUE " + splits.size());
497 averageRegionSize = Long.MAX_VALUE / splits.size();
498 }
499
500 if (averageRegionSize > maxAverageRegionSize) {
501 averageRegionSize = maxAverageRegionSize;
502 }
503
504
505 if (averageRegionSize < 64 * 1048576) {
506 return splits;
507 }
508 for (int i = 0; i < splits.size(); i++) {
509 TableSplit ts = (TableSplit) splits.get(i);
510 TableName tableName = ts.getTable();
511 String regionLocation = ts.getRegionLocation();
512 String encodedRegionName = ts.getEncodedRegionName();
513 long regionSize = ts.getLength();
514
515 if (regionSize >= averageRegionSize) {
516
517 int n = (int) Math.round(Math.log(((double) regionSize) / ((double) averageRegionSize)) + 1.0);
518 List<InputSplit> temp = createNInputSplitsUniform(ts, n);
519 resultList.addAll(temp);
520 } else {
521
522
523 long totalSize = regionSize;
524 byte[] splitStartKey = ts.getStartRow();
525 byte[] splitEndKey = ts.getEndRow();
526 int j = i + 1;
527 while (j < splits.size()) {
528 TableSplit nextRegion = (TableSplit) splits.get(j);
529 long nextRegionSize = nextRegion.getLength();
530 if (totalSize + nextRegionSize <= averageRegionSize) {
531 totalSize = totalSize + nextRegionSize;
532 splitEndKey = nextRegion.getEndRow();
533 j++;
534 } else {
535 break;
536 }
537 }
538 i = j - 1;
539
540
541
542 TableSplit t = new TableSplit(tableName, null, splitStartKey, splitEndKey, regionLocation,
543 encodedRegionName, totalSize);
544 resultList.add(t);
545 }
546 }
547 return resultList;
548 }
549
550
551
552
553
554
555
556
557 @Deprecated
558 public static byte[] getSplitKey(byte[] start, byte[] end, boolean isText) {
559 byte upperLimitByte;
560 byte lowerLimitByte;
561
562 if (isText) {
563
564
565 upperLimitByte = '~';
566 lowerLimitByte = ' ';
567 } else {
568 upperLimitByte = -1;
569 lowerLimitByte = 0;
570 }
571
572
573
574 if (start.length == 0 && end.length == 0){
575 return new byte[]{(byte) ((lowerLimitByte + upperLimitByte) / 2)};
576 }
577 if (start.length == 0 && end.length != 0){
578 return new byte[]{ end[0] };
579 }
580 if (start.length != 0 && end.length == 0){
581 byte[] result =new byte[start.length];
582 result[0]=start[0];
583 return result;
584 }
585 return Bytes.split(start, end, false, 1)[1];
586 }
587
588
589
590
591 @Deprecated
592 public String reverseDNS(InetAddress ipAddress) throws NamingException, UnknownHostException {
593 String hostName = this.reverseDNSCacheMap.get(ipAddress);
594 if (hostName == null) {
595 String ipAddressString = null;
596 try {
597 ipAddressString = DNS.reverseDns(ipAddress, null);
598 } catch (Exception e) {
599
600
601
602 ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
603 }
604 if (ipAddressString == null) throw new UnknownHostException("No host found for " + ipAddress);
605 hostName = Strings.domainNamePointerToHostName(ipAddressString);
606 this.reverseDNSCacheMap.put(ipAddress, hostName);
607 }
608 return hostName;
609 }
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635 protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
636 return true;
637 }
638
639
640
641
642
643
644 @Deprecated
645 protected HTable getHTable() {
646 return (HTable) this.getTable();
647 }
648
649
650
651
652 protected RegionLocator getRegionLocator() {
653 if (regionLocator == null) {
654 throw new IllegalStateException(NOT_INITIALIZED);
655 }
656 return regionLocator;
657 }
658
659
660
661
662 protected Table getTable() {
663 if (table == null) {
664 throw new IllegalStateException(NOT_INITIALIZED);
665 }
666 return table;
667 }
668
669
670
671
672 protected Admin getAdmin() {
673 if (admin == null) {
674 throw new IllegalStateException(NOT_INITIALIZED);
675 }
676 return admin;
677 }
678
679
680
681
682
683
684
685
686
687
688
689 @Deprecated
690 protected void setHTable(HTable table) throws IOException {
691 this.table = table;
692 this.connection = table.getConnection();
693 try {
694 this.regionLocator = table.getRegionLocator();
695 this.admin = this.connection.getAdmin();
696 } catch (NeedUnmanagedConnectionException exception) {
697 LOG.warn("You are using an HTable instance that relies on an HBase-managed Connection. " +
698 "This is usually due to directly creating an HTable, which is deprecated. Instead, you " +
699 "should create a Connection object and then request a Table instance from it. If you " +
700 "don't need the Table instance for your own use, you should instead use the " +
701 "TableInputFormatBase.initalizeTable method directly.");
702 LOG.info("Creating an additional unmanaged connection because user provided one can't be " +
703 "used for administrative actions. We'll close it when we close out the table.");
704 LOG.debug("Details about our failure to request an administrative interface.", exception);
705
706
707 this.connection = ConnectionFactory.createConnection(this.connection.getConfiguration());
708 this.regionLocator = this.connection.getRegionLocator(table.getName());
709 this.admin = this.connection.getAdmin();
710 }
711 }
712
713
714
715
716
717
718
719
720 protected void initializeTable(Connection connection, TableName tableName) throws IOException {
721 if (this.table != null || this.connection != null) {
722 LOG.warn("initializeTable called multiple times. Overwriting connection and table " +
723 "reference; TableInputFormatBase will not close these old references when done.");
724 }
725 this.table = connection.getTable(tableName);
726 this.regionLocator = connection.getRegionLocator(tableName);
727 this.admin = connection.getAdmin();
728 this.connection = connection;
729 }
730
731
732
733
734
735
736 public Scan getScan() {
737 if (this.scan == null) this.scan = new Scan();
738 return scan;
739 }
740
741
742
743
744
745
746 public void setScan(Scan scan) {
747 this.scan = scan;
748 }
749
750
751
752
753
754
755
756 protected void setTableRecordReader(TableRecordReader tableRecordReader) {
757 this.tableRecordReader = tableRecordReader;
758 }
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775 protected void initialize(JobContext context) throws IOException {
776 }
777
778
779
780
781
782
783
784 protected void closeTable() throws IOException {
785 close(admin, table, regionLocator, connection);
786 admin = null;
787 table = null;
788 regionLocator = null;
789 connection = null;
790 }
791
792 private void close(Closeable... closables) throws IOException {
793 for (Closeable c : closables) {
794 if(c != null) { c.close(); }
795 }
796 }
797
798 }