/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.io.InterruptedIOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.RegexStringComparator;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;

/**
 * A job with a map and reduce phase to count cells in a table. The counter lists the following
 * stats for a given table:
 *
 * <pre>
 * 1. Total number of rows in the table
 * 2. Total number of CFs across all rows
 * 3. Total qualifiers across all rows
 * 4. Total occurrence of each CF
 * 5. Total occurrence of each qualifier
 * 6. Total number of versions of each qualifier.
 * </pre>
 *
 * The cellcounter can take optional parameters to use a user supplied row/family/qualifier string
 * to use in the report and second a regex based or prefix based row filter to restrict the count
 * operation to a limited subset of rows from the table or a start time and/or end time to limit the
 * count to a time range.
 */
@InterfaceAudience.Public
public class CellCounter extends Configured implements Tool {
  private static final Logger LOG = LoggerFactory.getLogger(CellCounter.class.getName());

  /**
   * Name of this 'program'.
   */
  static final String NAME = "CellCounter";

  private final static String JOB_NAME_CONF_KEY = "mapreduce.job.name";

  /**
   * Mapper that runs the count.
   */
  static class CellCounterMapper extends TableMapper<Text, IntWritable> {
    /**
     * Counter enumeration to count the actual rows.
     */
    public enum Counters {
      ROWS,
      CELLS
    }

    private Configuration conf;
    private String separator;

    // state of current row, family, column needs to persist across map() invocations
    // in order to properly handle scanner batching, where a single qualifier may have too
    // many versions for a single map() call
    private byte[] lastRow;
    private String currentRowKey;
    byte[] currentFamily = null;
    String currentFamilyName = null;
    byte[] currentQualifier = null;
    // family + qualifier
    String currentQualifierName = null;
    // rowkey + family + qualifier
    String currentRowQualifierName = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      conf = context.getConfiguration();
      separator = conf.get("ReportSeparator", ":");
    }

    /**
     * Maps the data.
     * @param row The current table row key.
     * @param values The columns.
     * @param context The current context.
     * @throws IOException When something is broken with the data, or when the task is interrupted
     *           while emitting counts.
     */
    @Override
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_NULL_ON_SOME_PATH",
        justification = "Findbugs is blind to the Precondition null check")
    public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
      Preconditions.checkState(values != null, "values passed to the map is null");

      try {
        byte[] currentRow = values.getRow();
        if (lastRow == null || !Bytes.equals(lastRow, currentRow)) {
          // New row: reset per-row state and count the row exactly once, even when scanner
          // batching delivers one row across several map() invocations.
          lastRow = currentRow;
          currentRowKey = Bytes.toStringBinary(currentRow);
          currentFamily = null;
          currentQualifier = null;
          context.getCounter(Counters.ROWS).increment(1);
          context.write(new Text("Total ROWS"), new IntWritable(1));
        }
        if (!values.isEmpty()) {
          int cellCount = 0;
          for (Cell value : values.listCells()) {
            cellCount++;
            if (currentFamily == null || !CellUtil.matchingFamily(value, currentFamily)) {
              currentFamily = CellUtil.cloneFamily(value);
              currentFamilyName = Bytes.toStringBinary(currentFamily);
              currentQualifier = null;
              context.getCounter("CF", currentFamilyName).increment(1);
              // Emit the per-family records only the first time this family is seen by this task.
              if (1 == context.getCounter("CF", currentFamilyName).getValue()) {
                context.write(new Text("Total Families Across all Rows"), new IntWritable(1));
                context.write(new Text(currentFamily), new IntWritable(1));
              }
            }
            if (currentQualifier == null || !CellUtil.matchingQualifier(value, currentQualifier)) {
              currentQualifier = CellUtil.cloneQualifier(value);
              currentQualifierName =
                currentFamilyName + separator + Bytes.toStringBinary(currentQualifier);
              currentRowQualifierName = currentRowKey + separator + currentQualifierName;

              context.write(new Text("Total Qualifiers across all Rows"), new IntWritable(1));
              context.write(new Text(currentQualifierName), new IntWritable(1));
            }
            // Increment versions
            context.write(new Text(currentRowQualifierName + "_Versions"), new IntWritable(1));
          }
          context.getCounter(Counters.CELLS).increment(cellCount);
        }
      } catch (InterruptedException e) {
        // Fail the task rather than silently swallowing the interrupt (which would produce an
        // undercount with no visible error), and restore the interrupt status for the framework.
        Thread.currentThread().interrupt();
        throw (InterruptedIOException) new InterruptedIOException(
          "Interrupted while emitting cell counts").initCause(e);
      }
    }
  }

  /**
   * Reducer (also used as combiner) that sums up the per-key counts emitted by the mapper.
   */
  static class IntSumReducer<Key> extends Reducer<Key, IntWritable, Key, IntWritable> {

    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Key key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  /**
   * Sets up the actual job.
   * @param conf The current configuration.
   * @param args The command line parameters.
   * @return The newly created job.
   * @throws IOException When setting up the job fails.
   */
  public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    Path outputDir = new Path(args[1]);
    String reportSeparatorString = (args.length > 2) ? args[2] : ":";
    conf.set("ReportSeparator", reportSeparatorString);
    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(CellCounter.class);
    Scan scan = getConfiguredScanForJob(conf, args);
    TableMapReduceUtil.initTableMapperJob(tableName, scan, CellCounterMapper.class,
      ImmutableBytesWritable.class, Result.class, job);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setReducerClass(IntSumReducer.class);
    job.setCombinerClass(IntSumReducer.class);
    return job;
  }

  /**
   * Builds the {@link Scan} for the job from TableInputFormat configuration plus the command line
   * row filter and time range arguments.
   */
  private static Scan getConfiguredScanForJob(Configuration conf, String[] args)
    throws IOException {
    // create scan with any properties set from TableInputFormat
    Scan s = TableInputFormat.createScanFromConfiguration(conf);
    // Set Scan Versions
    if (conf.get(TableInputFormat.SCAN_MAXVERSIONS) == null) {
      // default to all versions unless explicitly set
      s.readAllVersions();
    }
    s.setCacheBlocks(false);
    // Set RowFilter or Prefix Filter if applicable.
    Filter rowFilter = getRowFilter(args);
    if (rowFilter != null) {
      LOG.info("Setting Row Filter for counter.");
      s.setFilter(rowFilter);
    }
    // Set TimeRange if defined
    long[] timeRange = getTimeRange(args);
    if (timeRange != null) {
      LOG.info("Setting TimeRange for counter.");
      s.setTimeRange(timeRange[0], timeRange[1]);
    }
    return s;
  }

  /**
   * Parses the optional 4th argument into a row filter: a leading '^' selects a regex row filter,
   * anything else is treated as a row-key prefix. Returns null when no filter argument was given.
   */
  private static Filter getRowFilter(String[] args) {
    Filter rowFilter = null;
    String filterCriteria = (args.length > 3) ? args[3] : null;
    if (filterCriteria == null) {
      return null;
    }
    if (filterCriteria.startsWith("^")) {
      String regexPattern = filterCriteria.substring(1);
      rowFilter = new RowFilter(CompareOperator.EQUAL, new RegexStringComparator(regexPattern));
    } else {
      rowFilter = new PrefixFilter(Bytes.toBytesBinary(filterCriteria));
    }
    return rowFilter;
  }

  /**
   * Extracts the optional --starttime/--endtime arguments.
   * @return a {start, end} pair, or null when neither argument was supplied. A missing end time
   *         defaults to {@link HConstants#LATEST_TIMESTAMP}.
   */
  private static long[] getTimeRange(String[] args) throws IOException {
    final String startTimeArgKey = "--starttime=";
    final String endTimeArgKey = "--endtime=";
    long startTime = 0L;
    long endTime = 0L;

    for (int i = 1; i < args.length; i++) {
      LOG.debug("Parsing arg {}: {}", i, args[i]);
      if (args[i].startsWith(startTimeArgKey)) {
        startTime = Long.parseLong(args[i].substring(startTimeArgKey.length()));
      }
      if (args[i].startsWith(endTimeArgKey)) {
        endTime = Long.parseLong(args[i].substring(endTimeArgKey.length()));
      }
    }

    if (startTime == 0 && endTime == 0) {
      return null;
    }

    endTime = endTime == 0 ? HConstants.LATEST_TIMESTAMP : endTime;
    return new long[] { startTime, endTime };
  }

  @Override
  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      printUsage(args.length);
      return -1;
    }
    Job job = createSubmittableJob(getConf(), args);
    return (job.waitForCompletion(true) ? 0 : 1);
  }

  /**
   * Prints usage information for the tool to stderr.
   * @param parameterCount the number of parameters that were actually supplied.
   */
  private void printUsage(int parameterCount) {
    System.err.println("ERROR: Wrong number of parameters: " + parameterCount);
    System.err.println("Usage: hbase cellcounter <tablename> <outputDir> [reportSeparator] "
      + "[^[regex pattern] or [Prefix]] [--starttime=<starttime> --endtime=<endtime>]");
    System.err.println("  Note: -D properties will be applied to the conf used.");
    System.err.println("  Additionally, all of the SCAN properties from TableInputFormat can be "
      + "specified to get fine grained control on what is counted.");
    System.err.println("   -D" + TableInputFormat.SCAN_ROW_START + "=<rowkey>");
    System.err.println("   -D" + TableInputFormat.SCAN_ROW_STOP + "=<rowkey>");
    System.err.println("   -D" + TableInputFormat.SCAN_COLUMNS + "=\"<col1> <col2>...\"");
    System.err.println("   -D" + TableInputFormat.SCAN_COLUMN_FAMILY + "=<family1>,<family2>, ...");
    System.err.println("   -D" + TableInputFormat.SCAN_TIMESTAMP + "=<timestamp>");
    System.err.println("   -D" + TableInputFormat.SCAN_TIMERANGE_START + "=<timestamp>");
    System.err.println("   -D" + TableInputFormat.SCAN_TIMERANGE_END + "=<timestamp>");
    System.err.println("   -D" + TableInputFormat.SCAN_MAXVERSIONS + "=<count>");
    System.err.println("   -D" + TableInputFormat.SCAN_CACHEDROWS + "=<count>");
    System.err.println("   -D" + TableInputFormat.SCAN_BATCHSIZE + "=<count>");
    System.err.println(" <reportSeparator> parameter can be used to override the default report "
      + "separator string : used to separate the rowId/column family name and qualifier name.");
    System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell "
      + "counter count operation to a limited subset of rows from the table based on regex or "
      + "prefix pattern.");
  }

  /**
   * Main entry point.
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    int errCode = ToolRunner.run(HBaseConfiguration.create(), new CellCounter(), args);
    System.exit(errCode);
  }
}