1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.mapred;
20
21 import java.io.IOException;
22 import java.util.Collection;
23 import java.util.Map;
24
25 import org.apache.hadoop.conf.Configuration;
26 import org.apache.hadoop.hbase.classification.InterfaceAudience;
27 import org.apache.hadoop.hbase.classification.InterfaceStability;
28 import org.apache.hadoop.fs.Path;
29 import org.apache.hadoop.hbase.HBaseConfiguration;
30 import org.apache.hadoop.hbase.MetaTableAccessor;
31 import org.apache.hadoop.hbase.TableName;
32 import org.apache.hadoop.hbase.client.Connection;
33 import org.apache.hadoop.hbase.client.ConnectionFactory;
34 import org.apache.hadoop.hbase.client.Put;
35 import org.apache.hadoop.hbase.client.Scan;
36 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
37 import org.apache.hadoop.hbase.mapreduce.MutationSerialization;
38 import org.apache.hadoop.hbase.mapreduce.ResultSerialization;
39 import org.apache.hadoop.hbase.security.User;
40 import org.apache.hadoop.hbase.security.UserProvider;
41 import org.apache.hadoop.hbase.security.token.TokenUtil;
42 import org.apache.hadoop.hbase.util.RegionSplitter;
43 import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
44 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
45 import org.apache.hadoop.io.Text;
46 import org.apache.hadoop.mapred.FileInputFormat;
47 import org.apache.hadoop.mapred.InputFormat;
48 import org.apache.hadoop.mapred.JobConf;
49 import org.apache.hadoop.mapred.OutputFormat;
50 import org.apache.hadoop.mapred.TextInputFormat;
51 import org.apache.hadoop.mapred.TextOutputFormat;
52 import org.apache.hadoop.security.token.Token;
53 import org.apache.zookeeper.KeeperException;
54
55
56
57
58 @InterfaceAudience.Public
59 @InterfaceStability.Stable
60 @SuppressWarnings({ "rawtypes", "unchecked" })
61 public class TableMapReduceUtil {
62
63
64
65
66
67
68
69
70
71
72
73
74 public static void initTableMapJob(String table, String columns,
75 Class<? extends TableMap> mapper,
76 Class<?> outputKeyClass,
77 Class<?> outputValueClass, JobConf job) {
78 initTableMapJob(table, columns, mapper, outputKeyClass, outputValueClass, job,
79 true, TableInputFormat.class);
80 }
81
82 public static void initTableMapJob(String table, String columns,
83 Class<? extends TableMap> mapper,
84 Class<?> outputKeyClass,
85 Class<?> outputValueClass, JobConf job, boolean addDependencyJars) {
86 initTableMapJob(table, columns, mapper, outputKeyClass, outputValueClass, job,
87 addDependencyJars, TableInputFormat.class);
88 }
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103 public static void initTableMapJob(String table, String columns,
104 Class<? extends TableMap> mapper,
105 Class<?> outputKeyClass,
106 Class<?> outputValueClass, JobConf job, boolean addDependencyJars,
107 Class<? extends InputFormat> inputFormat) {
108
109 job.setInputFormat(inputFormat);
110 job.setMapOutputValueClass(outputValueClass);
111 job.setMapOutputKeyClass(outputKeyClass);
112 job.setMapperClass(mapper);
113 job.setStrings("io.serializations", job.get("io.serializations"),
114 MutationSerialization.class.getName(), ResultSerialization.class.getName());
115 FileInputFormat.addInputPaths(job, table);
116 job.set(TableInputFormat.COLUMN_LIST, columns);
117 if (addDependencyJars) {
118 try {
119 addDependencyJars(job);
120 } catch (IOException e) {
121 e.printStackTrace();
122 }
123 }
124 try {
125 initCredentials(job);
126 } catch (IOException ioe) {
127
128 ioe.printStackTrace();
129 }
130 }
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146 public static void initMultiTableSnapshotMapperJob(Map<String, Collection<Scan>> snapshotScans,
147 Class<? extends TableMap> mapper, Class<?> outputKeyClass, Class<?> outputValueClass,
148 JobConf job, boolean addDependencyJars, Path tmpRestoreDir) throws IOException {
149 MultiTableSnapshotInputFormat.setInput(job, snapshotScans, tmpRestoreDir);
150
151 job.setInputFormat(MultiTableSnapshotInputFormat.class);
152 if (outputValueClass != null) {
153 job.setMapOutputValueClass(outputValueClass);
154 }
155 if (outputKeyClass != null) {
156 job.setMapOutputKeyClass(outputKeyClass);
157 }
158 job.setMapperClass(mapper);
159 if (addDependencyJars) {
160 addDependencyJars(job);
161 }
162
163 org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.resetCacheConfig(job);
164 }
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185 public static void initTableSnapshotMapJob(String snapshotName, String columns,
186 Class<? extends TableMap> mapper,
187 Class<?> outputKeyClass,
188 Class<?> outputValueClass, JobConf job,
189 boolean addDependencyJars, Path tmpRestoreDir)
190 throws IOException {
191 TableSnapshotInputFormat.setInput(job, snapshotName, tmpRestoreDir);
192 initTableMapJob(snapshotName, columns, mapper, outputKeyClass, outputValueClass, job,
193 addDependencyJars, TableSnapshotInputFormat.class);
194 org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.resetCacheConfig(job);
195 }
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218 public static void initTableSnapshotMapJob(String snapshotName, String columns,
219 Class<? extends TableMap> mapper,
220 Class<?> outputKeyClass,
221 Class<?> outputValueClass, JobConf jobConf,
222 boolean addDependencyJars, Path tmpRestoreDir,
223 RegionSplitter.SplitAlgorithm splitAlgo,
224 int numSplitsPerRegion)
225 throws IOException {
226 TableSnapshotInputFormat.setInput(jobConf, snapshotName, tmpRestoreDir, splitAlgo,
227 numSplitsPerRegion);
228 initTableMapJob(snapshotName, columns, mapper, outputKeyClass, outputValueClass, jobConf,
229 addDependencyJars, TableSnapshotInputFormat.class);
230 org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.resetCacheConfig(jobConf);
231 }
232
233
234
235
236
237
238
239
240
241
242
243 public static void initTableReduceJob(String table,
244 Class<? extends TableReduce> reducer, JobConf job)
245 throws IOException {
246 initTableReduceJob(table, reducer, job, null);
247 }
248
249
250
251
252
253
254
255
256
257
258
259
260 public static void initTableReduceJob(String table,
261 Class<? extends TableReduce> reducer, JobConf job, Class partitioner)
262 throws IOException {
263 initTableReduceJob(table, reducer, job, partitioner, true);
264 }
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279 public static void initTableReduceJob(String table,
280 Class<? extends TableReduce> reducer, JobConf job, Class partitioner,
281 boolean addDependencyJars) throws IOException {
282 job.setOutputFormat(TableOutputFormat.class);
283 job.setReducerClass(reducer);
284 job.set(TableOutputFormat.OUTPUT_TABLE, table);
285 job.setOutputKeyClass(ImmutableBytesWritable.class);
286 job.setOutputValueClass(Put.class);
287 job.setStrings("io.serializations", job.get("io.serializations"),
288 MutationSerialization.class.getName(), ResultSerialization.class.getName());
289 if (partitioner == HRegionPartitioner.class) {
290 job.setPartitionerClass(HRegionPartitioner.class);
291 int regions =
292 MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
293 if (job.getNumReduceTasks() > regions) {
294 job.setNumReduceTasks(regions);
295 }
296 } else if (partitioner != null) {
297 job.setPartitionerClass(partitioner);
298 }
299 if (addDependencyJars) {
300 addDependencyJars(job);
301 }
302 initCredentials(job);
303 }
304
305 public static void initCredentials(JobConf job) throws IOException {
306 UserProvider userProvider = UserProvider.instantiate(job);
307 if (userProvider.isHadoopSecurityEnabled()) {
308
309 if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
310 job.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
311 }
312 }
313
314 if (userProvider.isHBaseSecurityEnabled()) {
315 Connection conn = ConnectionFactory.createConnection(job);
316 try {
317
318 User user = userProvider.getCurrent();
319 TokenUtil.addTokenForJob(conn, job, user);
320 } catch (InterruptedException ie) {
321 ie.printStackTrace();
322 Thread.currentThread().interrupt();
323 } finally {
324 conn.close();
325 }
326 }
327 }
328
329
330
331
332
333
334
335
336
337
338 public static void limitNumReduceTasks(String table, JobConf job)
339 throws IOException {
340 int regions =
341 MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
342 if (job.getNumReduceTasks() > regions)
343 job.setNumReduceTasks(regions);
344 }
345
346
347
348
349
350
351
352
353
354
355 public static void limitNumMapTasks(String table, JobConf job)
356 throws IOException {
357 int regions =
358 MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
359 if (job.getNumMapTasks() > regions)
360 job.setNumMapTasks(regions);
361 }
362
363
364
365
366
367
368
369
370
371 public static void setNumReduceTasks(String table, JobConf job)
372 throws IOException {
373 job.setNumReduceTasks(MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job),
374 TableName.valueOf(table)));
375 }
376
377
378
379
380
381
382
383
384
385 public static void setNumMapTasks(String table, JobConf job)
386 throws IOException {
387 job.setNumMapTasks(MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job),
388 TableName.valueOf(table)));
389 }
390
391
392
393
394
395
396
397
398
399
400 public static void setScannerCaching(JobConf job, int batchSize) {
401 job.setInt("hbase.client.scanner.caching", batchSize);
402 }
403
404
405
406
407 public static void addDependencyJars(JobConf job) throws IOException {
408 org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
409 org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJarsForClasses(
410 job,
411
412
413 job.getMapOutputKeyClass(),
414 job.getMapOutputValueClass(),
415 job.getOutputKeyClass(),
416 job.getOutputValueClass(),
417 job.getPartitionerClass(),
418 job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
419 job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
420 job.getCombinerClass());
421 }
422 }