View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.util.hbck;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collection;
24  import java.util.HashSet;
25  import java.util.List;
26  import java.util.Set;
27  import java.util.concurrent.Callable;
28  import java.util.concurrent.ConcurrentSkipListSet;
29  import java.util.concurrent.ExecutionException;
30  import java.util.concurrent.ExecutorService;
31  import java.util.concurrent.Future;
32  import java.util.concurrent.atomic.AtomicInteger;
33  
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.apache.hadoop.hbase.classification.InterfaceAudience;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.fs.FileStatus;
39  import org.apache.hadoop.fs.FileSystem;
40  import org.apache.hadoop.fs.Path;
41  import org.apache.hadoop.hbase.HConstants;
42  import org.apache.hadoop.hbase.io.hfile.CacheConfig;
43  import org.apache.hadoop.hbase.io.hfile.CorruptHFileException;
44  import org.apache.hadoop.hbase.io.hfile.HFile;
45  import org.apache.hadoop.hbase.util.FSUtils;
46  import org.apache.hadoop.hbase.util.FSUtils.FamilyDirFilter;
47  import org.apache.hadoop.hbase.util.FSUtils.HFileFilter;
48  import org.apache.hadoop.hbase.util.FSUtils.RegionDirFilter;
49  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
50  
51  /**
52   * This class marches through all of the region's hfiles and verifies that
53   * they are all valid files. One just needs to instantiate the class, use
54   * checkTables(List<Path>) and then retrieve the corrupted hfiles (and
55   * quarantined files if in quarantining mode)
56   *
57   * The implementation currently parallelizes at the regionDir level.
58   */
59  @InterfaceAudience.Private
60  public class HFileCorruptionChecker {
61    private static final Log LOG = LogFactory.getLog(HFileCorruptionChecker.class);
62  
63    final Configuration conf;
64    final FileSystem fs;
65    final CacheConfig cacheConf;
66    final ExecutorService executor;
67    final Set<Path> corrupted = new ConcurrentSkipListSet<Path>();
68    final Set<Path> failures = new ConcurrentSkipListSet<Path>();
69    final Set<Path> quarantined = new ConcurrentSkipListSet<Path>();
70    final Set<Path> missing = new ConcurrentSkipListSet<Path>();
71    final boolean inQuarantineMode;
72    final AtomicInteger hfilesChecked = new AtomicInteger();
73  
74    public HFileCorruptionChecker(Configuration conf, ExecutorService executor,
75        boolean quarantine) throws IOException {
76      this.conf = conf;
77      this.fs = FileSystem.get(conf);
78      this.cacheConf = new CacheConfig(conf);
79      this.executor = executor;
80      this.inQuarantineMode = quarantine;
81    }
82  
83    /**
84     * Checks a path to see if it is a valid hfile.
85     *
86     * @param p
87     *          full Path to an HFile
88     * @throws IOException
89     *           This is a connectivity related exception
90     */
91    protected void checkHFile(Path p) throws IOException {
92      HFile.Reader r = null;
93      try {
94        r = HFile.createReader(fs, p, cacheConf, conf);
95      } catch (CorruptHFileException che) {
96        LOG.warn("Found corrupt HFile " + p, che);
97        corrupted.add(p);
98        if (inQuarantineMode) {
99          Path dest = createQuarantinePath(p);
100         LOG.warn("Quarantining corrupt HFile " + p + " into " + dest);
101         boolean success = fs.mkdirs(dest.getParent());
102         success = success ? fs.rename(p, dest): false;
103         if (!success) {
104           failures.add(p);
105         } else {
106           quarantined.add(dest);
107         }
108       }
109       return;
110     } catch (FileNotFoundException fnfe) {
111       LOG.warn("HFile " + p + " was missing.  Likely removed due to compaction/split?");
112       missing.add(p);
113     } finally {
114       hfilesChecked.addAndGet(1);
115       if (r != null) {
116         r.close(true);
117       }
118     }
119   }
120 
121   /**
122    * Given a path, generates a new path to where we move a corrupted hfile (bad
123    * trailer, no trailer).
124    *
125    * @param hFile
126    *          Path to a corrupt hfile (assumes that it is HBASE_DIR/ table
127    *          /region/cf/file)
128    * @return path to where corrupted files are stored. This should be
129    *         HBASE_DIR/.corrupt/table/region/cf/file.
130    */
131   Path createQuarantinePath(Path hFile) throws IOException {
132     // extract the normal dirs structure
133     Path cfDir = hFile.getParent();
134     Path regionDir = cfDir.getParent();
135     Path tableDir = regionDir.getParent();
136 
137     // build up the corrupted dirs strcture
138     Path corruptBaseDir = new Path(FSUtils.getRootDir(conf), conf.get(
139         "hbase.hfile.quarantine.dir", HConstants.CORRUPT_DIR_NAME));
140     Path corruptTableDir = new Path(corruptBaseDir, tableDir.getName());
141     Path corruptRegionDir = new Path(corruptTableDir, regionDir.getName());
142     Path corruptFamilyDir = new Path(corruptRegionDir, cfDir.getName());
143     Path corruptHfile = new Path(corruptFamilyDir, hFile.getName());
144     return corruptHfile;
145   }
146 
147   /**
148    * Check all files in a column family dir.
149    *
150    * @param cfDir
151    *          column family directory
152    * @throws IOException
153    */
154   protected void checkColFamDir(Path cfDir) throws IOException {
155     FileStatus[] statuses = null;
156     try {
157       statuses = fs.listStatus(cfDir); // use same filter as scanner.
158     } catch (FileNotFoundException fnfe) {
159       // Hadoop 0.23+ listStatus semantics throws an exception if the path does not exist.
160       LOG.warn("Colfam Directory " + cfDir +
161           " does not exist.  Likely due to concurrent split/compaction. Skipping.");
162       missing.add(cfDir);
163       return;
164     }
165 
166     List<FileStatus> hfs = FSUtils.filterFileStatuses(statuses, new HFileFilter(fs));
167     // Hadoop 1.0 listStatus does not throw an exception if the path does not exist.
168     if (hfs.size() == 0 && !fs.exists(cfDir)) {
169       LOG.warn("Colfam Directory " + cfDir +
170           " does not exist.  Likely due to concurrent split/compaction. Skipping.");
171       missing.add(cfDir);
172       return;
173     }
174     for (FileStatus hfFs : hfs) {
175       Path hf = hfFs.getPath();
176       checkHFile(hf);
177     }
178   }
179 
180   /**
181    * Check all column families in a region dir.
182    *
183    * @param regionDir
184    *          region directory
185    * @throws IOException
186    */
187   protected void checkRegionDir(Path regionDir) throws IOException {
188     FileStatus[] statuses = null;
189     try {
190       statuses = fs.listStatus(regionDir);
191     } catch (FileNotFoundException fnfe) {
192       // Hadoop 0.23+ listStatus semantics throws an exception if the path does not exist.
193       LOG.warn("Region Directory " + regionDir +
194           " does not exist.  Likely due to concurrent split/compaction. Skipping.");
195       missing.add(regionDir);
196       return;
197     }
198 
199     List<FileStatus> cfs = FSUtils.filterFileStatuses(statuses, new FamilyDirFilter(fs));
200     // Hadoop 1.0 listStatus does not throw an exception if the path does not exist.
201     if (cfs.size() == 0 && !fs.exists(regionDir)) {
202       LOG.warn("Region Directory " + regionDir +
203           " does not exist.  Likely due to concurrent split/compaction. Skipping.");
204       missing.add(regionDir);
205       return;
206     }
207 
208     for (FileStatus cfFs : cfs) {
209       Path cfDir = cfFs.getPath();
210       checkColFamDir(cfDir);
211     }
212   }
213 
214   /**
215    * Check all the regiondirs in the specified tableDir
216    *
217    * @param tableDir
218    *          path to a table
219    * @throws IOException
220    */
221   void checkTableDir(Path tableDir) throws IOException {
222     List<FileStatus> rds = FSUtils.listStatusWithStatusFilter(fs, tableDir, new RegionDirFilter(fs));
223     if (rds == null) {
224       if (!fs.exists(tableDir)) {
225         LOG.warn("Table Directory " + tableDir +
226             " does not exist.  Likely due to concurrent delete. Skipping.");
227         missing.add(tableDir);
228       }
229       return;
230     }
231 
232     // Parallelize check at the region dir level
233     List<RegionDirChecker> rdcs = new ArrayList<RegionDirChecker>();
234     List<Future<Void>> rdFutures;
235 
236     for (FileStatus rdFs : rds) {
237       Path rdDir = rdFs.getPath();
238       RegionDirChecker work = new RegionDirChecker(rdDir);
239       rdcs.add(work);
240     }
241 
242     // Submit and wait for completion
243     try {
244       rdFutures = executor.invokeAll(rdcs);
245     } catch (InterruptedException ie) {
246       Thread.currentThread().interrupt();
247       LOG.warn("Region dirs checking interrupted!", ie);
248       return;
249     }
250 
251     for (int i = 0; i < rdFutures.size(); i++) {
252       Future<Void> f = rdFutures.get(i);
253       try {
254         f.get();
255       } catch (ExecutionException e) {
256         LOG.warn("Failed to quarantine an HFile in regiondir "
257             + rdcs.get(i).regionDir, e.getCause());
258         // rethrow IOExceptions
259         if (e.getCause() instanceof IOException) {
260           throw (IOException) e.getCause();
261         }
262 
263         // rethrow RuntimeExceptions
264         if (e.getCause() instanceof RuntimeException) {
265           throw (RuntimeException) e.getCause();
266         }
267 
268         // this should never happen
269         LOG.error("Unexpected exception encountered", e);
270         return; // bailing out.
271       } catch (InterruptedException ie) {
272         Thread.currentThread().interrupt();
273         LOG.warn("Region dirs check interrupted!", ie);
274         // bailing out
275         return;
276       }
277     }
278   }
279 
280   /**
281    * An individual work item for parallelized regiondir processing. This is
282    * intentionally an inner class so it can use the shared error sets and fs.
283    */
284   private class RegionDirChecker implements Callable<Void> {
285     final Path regionDir;
286 
287     RegionDirChecker(Path regionDir) {
288       this.regionDir = regionDir;
289     }
290 
291     @Override
292     public Void call() throws IOException {
293       checkRegionDir(regionDir);
294       return null;
295     }
296   }
297 
298   /**
299    * Check the specified table dirs for bad hfiles.
300    */
301   public void checkTables(Collection<Path> tables) throws IOException {
302     for (Path t : tables) {
303       checkTableDir(t);
304     }
305   }
306 
307   /**
308    * @return the set of check failure file paths after checkTables is called.
309    */
310   public Collection<Path> getFailures() {
311     return new HashSet<Path>(failures);
312   }
313 
314   /**
315    * @return the set of corrupted file paths after checkTables is called.
316    */
317   public Collection<Path> getCorrupted() {
318     return new HashSet<Path>(corrupted);
319   }
320 
321   /**
322    * @return number of hfiles checked in the last HfileCorruptionChecker run
323    */
324   public int getHFilesChecked() {
325     return hfilesChecked.get();
326   }
327 
328   /**
329    * @return the set of successfully quarantined paths after checkTables is called.
330    */
331   public Collection<Path> getQuarantined() {
332     return new HashSet<Path>(quarantined);
333   }
334 
335   /**
336    * @return the set of paths that were missing.  Likely due to deletion/moves from
337    *  compaction or flushes.
338    */
339   public Collection<Path> getMissing() {
340     return new HashSet<Path>(missing);
341   }
342 
343   /**
344    * Print a human readable summary of hfile quarantining operations.
345    * @param out
346    */
347   public void report(ErrorReporter out) {
348     out.print("Checked " + hfilesChecked.get() + " hfile for corruption");
349     out.print("  HFiles corrupted:                  " + corrupted.size());
350     if (inQuarantineMode) {
351       out.print("    HFiles successfully quarantined: " + quarantined.size());
352       for (Path sq : quarantined) {
353         out.print("      " + sq);
354       }
355       out.print("    HFiles failed quarantine:        " + failures.size());
356       for (Path fq : failures) {
357         out.print("      " + fq);
358       }
359     }
360     out.print("    HFiles moved while checking:     " + missing.size());
361     for (Path mq : missing) {
362       out.print("      " + mq);
363     }
364 
365     String initialState = (corrupted.size() == 0) ? "OK" : "CORRUPTED";
366     String fixedState = (corrupted.size() == quarantined.size()) ? "OK"
367         : "CORRUPTED";
368 
369     if (inQuarantineMode) {
370       out.print("Summary: " + initialState + " => " + fixedState);
371     } else {
372       out.print("Summary: " + initialState);
373     }
374   }
375 }