View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver.wal;
20  
21  import static org.junit.Assert.assertEquals;
22  import static org.junit.Assert.assertFalse;
23  import static org.junit.Assert.assertNotNull;
24  import static org.junit.Assert.assertTrue;
25  
26  import java.io.EOFException;
27  import java.io.IOException;
28  import java.io.InterruptedIOException;
29  import java.util.ArrayList;
30  import java.util.HashSet;
31  import java.util.List;
32  import java.util.Set;
33  import java.util.concurrent.atomic.AtomicBoolean;
34  
35  import org.apache.commons.logging.Log;
36  import org.apache.commons.logging.LogFactory;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.fs.FileSystem;
39  import org.apache.hadoop.fs.Path;
40  import org.apache.hadoop.hbase.Cell;
41  import org.apache.hadoop.hbase.HBaseTestingUtility;
42  import org.apache.hadoop.hbase.HColumnDescriptor;
43  import org.apache.hadoop.hbase.HConstants;
44  import org.apache.hadoop.hbase.HRegionInfo;
45  import org.apache.hadoop.hbase.HTableDescriptor;
46  import org.apache.hadoop.hbase.testclassification.LargeTests;
47  import org.apache.hadoop.hbase.MiniHBaseCluster;
48  import org.apache.hadoop.hbase.ServerName;
49  import org.apache.hadoop.hbase.TableName;
50  import org.apache.hadoop.hbase.Waiter;
51  import org.apache.hadoop.hbase.client.Admin;
52  import org.apache.hadoop.hbase.client.Get;
53  import org.apache.hadoop.hbase.client.HTable;
54  import org.apache.hadoop.hbase.client.Put;
55  import org.apache.hadoop.hbase.client.Result;
56  import org.apache.hadoop.hbase.client.ResultScanner;
57  import org.apache.hadoop.hbase.client.Scan;
58  import org.apache.hadoop.hbase.client.Table;
59  import org.apache.hadoop.hbase.fs.HFileSystem;
60  import org.apache.hadoop.hbase.regionserver.HRegionServer;
61  import org.apache.hadoop.hbase.regionserver.Region;
62  import org.apache.hadoop.hbase.regionserver.Store;
63  import org.apache.hadoop.hbase.util.Bytes;
64  import org.apache.hadoop.hbase.util.FSUtils;
65  import org.apache.hadoop.hbase.util.JVMClusterUtil;
66  import org.apache.hadoop.hbase.util.Threads;
67  import org.apache.hadoop.hbase.wal.DefaultWALProvider;
68  import org.apache.hadoop.hbase.wal.WAL;
69  import org.apache.hadoop.hbase.wal.WAL.Entry;
70  import org.apache.hadoop.hbase.wal.WALFactory;
71  import org.apache.hadoop.hbase.wal.WALProvider.Writer;
72  import org.apache.hadoop.hdfs.MiniDFSCluster;
73  import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
74  import org.apache.hadoop.hdfs.server.datanode.DataNode;
75  import org.junit.After;
76  import org.junit.Assert;
77  import org.junit.Before;
78  import org.junit.BeforeClass;
79  import org.junit.Rule;
80  import org.junit.Test;
81  import org.junit.experimental.categories.Category;
82  import org.junit.rules.TestName;
83  
84  /**
85   * Test log deletion as logs are rolled.
86   */
87  @Category(LargeTests.class)
88  public class TestLogRolling  {
89    private static final Log LOG = LogFactory.getLog(TestLogRolling.class);
90    private HRegionServer server;
91    private String tableName;
92    private byte[] value;
93    private FileSystem fs;
94    private MiniDFSCluster dfsCluster;
95    private Admin admin;
96    private MiniHBaseCluster cluster;
97    private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
98    @Rule public final TestName name = new TestName();
99  
100   public TestLogRolling()  {
101     this.server = null;
102     this.tableName = null;
103 
104     String className = this.getClass().getName();
105     StringBuilder v = new StringBuilder(className);
106     while (v.length() < 1000) {
107       v.append(className);
108     }
109     this.value = Bytes.toBytes(v.toString());
110   }
111 
112   // Need to override this setup so we can edit the config before it gets sent
113   // to the HDFS & HBase cluster startup.
114   @BeforeClass
115   public static void setUpBeforeClass() throws Exception {
116     // TODO: testLogRollOnDatanodeDeath fails if short circuit reads are on under the hadoop2
117     // profile. See HBASE-9337 for related issues.
118     System.setProperty("hbase.tests.use.shortcircuit.reads", "false");
119 
120     /**** configuration for testLogRolling ****/
121     // Force a region split after every 768KB
122     TEST_UTIL.getConfiguration().setLong(HConstants.HREGION_MAX_FILESIZE, 768L * 1024L);
123 
124     // We roll the log after every 32 writes
125     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.maxlogentries", 32);
126 
127     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.logroll.errors.tolerated", 2);
128     TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000);
129 
130     // For less frequently updated regions flush after every 2 flushes
131     TEST_UTIL.getConfiguration().setInt("hbase.hregion.memstore.optionalflushcount", 2);
132 
133     // We flush the cache after every 8192 bytes
134     TEST_UTIL.getConfiguration().setInt(
135         HConstants.HREGION_MEMSTORE_FLUSH_SIZE, 8192);
136 
137     // Increase the amount of time between client retries
138     TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 10 * 1000);
139 
140     // Reduce thread wake frequency so that other threads can get
141     // a chance to run.
142     TEST_UTIL.getConfiguration().setInt(HConstants.THREAD_WAKE_FREQUENCY, 2 * 1000);
143 
144    /**** configuration for testLogRollOnDatanodeDeath ****/
145    // make sure log.hflush() calls syncFs() to open a pipeline
146     TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
147    // lower the namenode & datanode heartbeat so the namenode
148    // quickly detects datanode failures
149     TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000);
150     TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1);
151     // the namenode might still try to choose the recently-dead datanode
152     // for a pipeline, so try to a new pipeline multiple times
153     TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 30);
154     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.hlog.tolerable.lowreplication", 2);
155     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.hlog.lowreplication.rolllimit", 3);
156 
157     // For slow sync threshold test: roll after 5 slow syncs in 10 seconds
158     TEST_UTIL.getConfiguration().setInt(FSHLog.SLOW_SYNC_ROLL_THRESHOLD, 5);
159     TEST_UTIL.getConfiguration().setInt(FSHLog.SLOW_SYNC_ROLL_INTERVAL_MS, 10 * 1000);
160     // For slow sync threshold test: roll once after a sync above this threshold
161     TEST_UTIL.getConfiguration().setInt(FSHLog.ROLL_ON_SYNC_TIME_MS, 5000);
162   }
163 
164   @Before
165   public void setUp() throws Exception {
166     TEST_UTIL.startMiniCluster(1, 1, 2);
167 
168     cluster = TEST_UTIL.getHBaseCluster();
169     dfsCluster = TEST_UTIL.getDFSCluster();
170     fs = TEST_UTIL.getTestFileSystem();
171     admin = TEST_UTIL.getHBaseAdmin();
172 
173     // disable region rebalancing (interferes with log watching)
174     cluster.getMaster().balanceSwitch(false);
175   }
176 
177   @After
178   public void tearDown() throws Exception  {
179     TEST_UTIL.shutdownMiniCluster();
180   }
181 
182   private void startAndWriteData() throws IOException, InterruptedException {
183     // When the hbase:meta table can be opened, the region servers are running
184     new HTable(TEST_UTIL.getConfiguration(), TableName.META_TABLE_NAME);
185     this.server = cluster.getRegionServerThreads().get(0).getRegionServer();
186 
187     Table table = createTestTable(this.tableName);
188 
189     server = TEST_UTIL.getRSForFirstRegionInTable(table.getName());
190     for (int i = 1; i <= 256; i++) {    // 256 writes should cause 8 log rolls
191       doPut(table, i);
192       if (i % 32 == 0) {
193         // After every 32 writes sleep to let the log roller run
194         try {
195           Thread.sleep(2000);
196         } catch (InterruptedException e) {
197           // continue
198         }
199       }
200     }
201   }
202 
203   /**
204    * Tests that log rolling doesn't hang when no data is written.
205    */
206   @Test(timeout=120000)
207   public void testLogRollOnNothingWritten() throws Exception {
208     final Configuration conf = TEST_UTIL.getConfiguration();
209     final WALFactory wals = new WALFactory(conf, null,
210         ServerName.valueOf("test.com",8080, 1).toString());
211     final WAL newLog = wals.getWAL(new byte[]{}, null);
212     try {
213       // Now roll the log before we write anything.
214       newLog.rollWriter(true);
215     } finally {
216       wals.close();
217     }
218   }
219 
220   /**
221    * Tests that logs are deleted
222    * @throws IOException
223    * @throws org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException
224    */
225   @Test
226   public void testLogRolling() throws Exception {
227     this.tableName = getName();
228     // TODO: Why does this write data take for ever?
229     startAndWriteData();
230     HRegionInfo region =
231         server.getOnlineRegions(TableName.valueOf(tableName)).get(0).getRegionInfo();
232     final WAL log = server.getWAL(region);
233     LOG.info("after writing there are " + DefaultWALProvider.getNumRolledLogFiles(log) +
234         " log files");
235 
236     // flush all regions
237     for (Region r: server.getOnlineRegionsLocalContext()) {
238       r.flush(true);
239     }
240 
241     // Now roll the log
242     log.rollWriter();
243 
244     int count = DefaultWALProvider.getNumRolledLogFiles(log);
245     LOG.info("after flushing all regions and rolling logs there are " + count + " log files");
246       assertTrue(("actual count: " + count), count <= 2);
247   }
248 
249   @Test
250   public void testSlowSyncLogRolling() throws Exception {
251     // Create the test table
252     HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(getName()));
253     desc.addFamily(new HColumnDescriptor(HConstants.CATALOG_FAMILY));
254     admin.createTable(desc);
255     Table table = TEST_UTIL.getConnection().getTable(desc.getTableName());
256     int row = 1;
257     try {
258       assertTrue(((HTable) table).isAutoFlush());
259 
260       // Get a reference to the FSHLog
261       server = TEST_UTIL.getRSForFirstRegionInTable(desc.getTableName());
262       HRegionInfo region = server.getOnlineRegions(desc.getTableName()).get(0).getRegionInfo();
263       final FSHLog log = (FSHLog) server.getWAL(region);
264 
265       // Register a WALActionsListener to observe if a SLOW_SYNC roll is requested
266 
267       final AtomicBoolean slowSyncHookCalled = new AtomicBoolean();
268       log.registerWALActionsListener(new WALActionsListener.Base() {
269         @Override
270         public void logRollRequested(WALActionsListener.RollRequestReason reason) {
271           switch (reason) {
272             case SLOW_SYNC:
273               slowSyncHookCalled.lazySet(true);
274               break;
275             default:
276               break;
277           }
278         }
279       });
280 
281       // Write some data
282 
283       for (int i = 0; i < 10; i++) {
284         writeData(table, row++);
285       }
286 
287       assertFalse("Should not have triggered log roll due to SLOW_SYNC",
288         slowSyncHookCalled.get());
289 
290       // Set up for test
291       slowSyncHookCalled.set(false);
292 
293       // Wrap the current writer with the anonymous class below that adds 200 ms of
294       // latency to any sync on the hlog. This should be more than sufficient to trigger
295       // slow sync warnings.
296       final Writer oldWriter1 = log.getWriter();
297       final Writer newWriter1 = new Writer() {
298         @Override
299         public void close() throws IOException {
300           oldWriter1.close();
301         }
302         @Override
303         public void sync(boolean forceSync) throws IOException {
304           try {
305             Thread.sleep(200);
306           } catch (InterruptedException e) {
307             InterruptedIOException ex = new InterruptedIOException();
308             ex.initCause(e);
309             throw ex;
310           }
311           oldWriter1.sync(forceSync);
312         }
313         @Override
314         public void append(Entry entry) throws IOException {
315           oldWriter1.append(entry);
316         }
317         @Override
318         public long getLength() throws IOException {
319           return oldWriter1.getLength();
320         }
321       };
322       log.setWriter(newWriter1);
323 
324       // Write some data.
325       // We need to write at least 5 times, but double it. We should only request
326       // a SLOW_SYNC roll once in the current interval.
327       for (int i = 0; i < 10; i++) {
328         writeData(table, row++);
329       }
330 
331       // Wait for our wait injecting writer to get rolled out, as needed.
332 
333       TEST_UTIL.waitFor(10000, 100, new Waiter.ExplainingPredicate<Exception>() {
334         @Override
335         public boolean evaluate() throws Exception {
336           return log.getWriter() != newWriter1;
337         }
338         @Override
339         public String explainFailure() throws Exception {
340           return "Waited too long for our test writer to get rolled out";
341         }
342       });
343 
344       assertTrue("Should have triggered log roll due to SLOW_SYNC",
345         slowSyncHookCalled.get());
346 
347       // Set up for test
348       slowSyncHookCalled.set(false);
349 
350       // Wrap the current writer with the anonymous class below that adds 5000 ms of
351       // latency to any sync on the hlog.
352       // This will trip the other threshold.
353       final Writer oldWriter2 = log.getWriter();
354       final Writer newWriter2 = new Writer() {
355         @Override
356         public void close() throws IOException {
357           oldWriter2.close();
358         }
359         @Override
360         public void sync(boolean forceSync) throws IOException {
361           try {
362             Thread.sleep(5000);
363           } catch (InterruptedException e) {
364             InterruptedIOException ex = new InterruptedIOException();
365             ex.initCause(e);
366             throw ex;
367           }
368           oldWriter2.sync(forceSync);
369         }
370         @Override
371         public void append(Entry entry) throws IOException {
372           oldWriter2.append(entry);
373         }
374         @Override
375         public long getLength() throws IOException {
376           return oldWriter2.getLength();
377         }
378       };
379       log.setWriter(newWriter2);
380 
381       // Write some data. Should only take one sync.
382 
383       writeData(table, row++);
384 
385       // Wait for our wait injecting writer to get rolled out, as needed.
386 
387       TEST_UTIL.waitFor(10000, 100, new Waiter.ExplainingPredicate<Exception>() {
388         @Override
389         public boolean evaluate() throws Exception {
390           return log.getWriter() != newWriter2;
391         }
392         @Override
393         public String explainFailure() throws Exception {
394           return "Waited too long for our test writer to get rolled out";
395         }
396       });
397 
398       assertTrue("Should have triggered log roll due to SLOW_SYNC",
399         slowSyncHookCalled.get());
400 
401       // Set up for test
402       slowSyncHookCalled.set(false);
403 
404       // Write some data
405       for (int i = 0; i < 10; i++) {
406         writeData(table, row++);
407       }
408 
409       assertFalse("Should not have triggered log roll due to SLOW_SYNC",
410         slowSyncHookCalled.get());
411 
412     } finally {
413       table.close();
414     }
415   }
416 
417   private String getName() {
418     return "TestLogRolling-" + name.getMethodName();
419   }
420 
421   void writeData(Table table, int rownum) throws IOException {
422     doPut(table, rownum);
423 
424     // sleep to let the log roller run (if it needs to)
425     try {
426       Thread.sleep(2000);
427     } catch (InterruptedException e) {
428       // continue
429     }
430   }
431 
432   void validateData(Table table, int rownum) throws IOException {
433     String row = "row" + String.format("%1$04d", rownum);
434     Get get = new Get(Bytes.toBytes(row));
435     get.addFamily(HConstants.CATALOG_FAMILY);
436     Result result = table.get(get);
437     assertTrue(result.size() == 1);
438     assertTrue(Bytes.equals(value,
439                 result.getValue(HConstants.CATALOG_FAMILY, null)));
440     LOG.info("Validated row " + row);
441   }
442 
443   void batchWriteAndWait(Table table, final FSHLog log, int start, boolean expect, int timeout)
444       throws IOException {
445     for (int i = 0; i < 10; i++) {
446       Put put = new Put(Bytes.toBytes("row"
447           + String.format("%1$04d", (start + i))));
448       put.add(HConstants.CATALOG_FAMILY, null, value);
449       table.put(put);
450     }
451     Put tmpPut = new Put(Bytes.toBytes("tmprow"));
452     tmpPut.add(HConstants.CATALOG_FAMILY, null, value);
453     long startTime = System.currentTimeMillis();
454     long remaining = timeout;
455     while (remaining > 0) {
456       if (log.isLowReplicationRollEnabled() == expect) {
457         break;
458       } else {
459         // Trigger calling FSHlog#checkLowReplication()
460         table.put(tmpPut);
461         try {
462           Thread.sleep(200);
463         } catch (InterruptedException e) {
464           // continue
465         }
466         remaining = timeout - (System.currentTimeMillis() - startTime);
467       }
468     }
469   }
470 
471   /**
472    * Tests that logs are rolled upon detecting datanode death
473    * Requires an HDFS jar with HDFS-826 & syncFs() support (HDFS-200)
474    */
475   @Test
476   public void testLogRollOnDatanodeDeath() throws Exception {
477     TEST_UTIL.ensureSomeRegionServersAvailable(2);
478     assertTrue("This test requires WAL file replication set to 2.",
479       fs.getDefaultReplication(TEST_UTIL.getDataTestDirOnTestFS()) == 2);
480     LOG.info("Replication=" +
481       fs.getDefaultReplication(TEST_UTIL.getDataTestDirOnTestFS()));
482 
483     this.server = cluster.getRegionServer(0);
484 
485     // Create the test table and open it
486     HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(getName()));
487     desc.addFamily(new HColumnDescriptor(HConstants.CATALOG_FAMILY));
488 
489     admin.createTable(desc);
490     Table table = TEST_UTIL.getConnection().getTable(desc.getTableName());
491     assertTrue(((HTable) table).isAutoFlush());
492 
493     server = TEST_UTIL.getRSForFirstRegionInTable(desc.getTableName());
494     HRegionInfo region = server.getOnlineRegions(desc.getTableName()).get(0).getRegionInfo();
495     final FSHLog log = (FSHLog) server.getWAL(region);
496     final AtomicBoolean lowReplicationHookCalled = new AtomicBoolean(false);
497     log.registerWALActionsListener(new WALActionsListener.Base() {
498       @Override
499       public void logRollRequested(WALActionsListener.RollRequestReason reason) {
500         switch (reason) {
501           case LOW_REPLICATION:
502             lowReplicationHookCalled.lazySet(true);
503             break;
504           default:
505             break;
506         }
507       }
508     });
509 
510     // don't run this test without append support (HDFS-200 & HDFS-142)
511     assertTrue("Need append support for this test", FSUtils
512         .isAppendSupported(TEST_UTIL.getConfiguration()));
513 
514     // add up the datanode count, to ensure proper replication when we kill 1
515     // This function is synchronous; when it returns, the dfs cluster is active
516     // We start 3 servers and then stop 2 to avoid a directory naming conflict
517     //  when we stop/start a namenode later, as mentioned in HBASE-5163
518     List<DataNode> existingNodes = dfsCluster.getDataNodes();
519     int numDataNodes = 3;
520     dfsCluster.startDataNodes(TEST_UTIL.getConfiguration(), numDataNodes, true,
521         null, null);
522     List<DataNode> allNodes = dfsCluster.getDataNodes();
523     for (int i = allNodes.size()-1; i >= 0; i--) {
524       if (existingNodes.contains(allNodes.get(i))) {
525         dfsCluster.stopDataNode( i );
526       }
527     }
528 
529     assertTrue("DataNodes " + dfsCluster.getDataNodes().size() +
530         " default replication " +
531         fs.getDefaultReplication(TEST_UTIL.getDataTestDirOnTestFS()),
532     dfsCluster.getDataNodes().size() >=
533       fs.getDefaultReplication(TEST_UTIL.getDataTestDirOnTestFS()) + 1);
534 
535     writeData(table, 2);
536 
537     long curTime = System.currentTimeMillis();
538     LOG.info("log.getCurrentFileName(): " + log.getCurrentFileName());
539     long oldFilenum = DefaultWALProvider.extractFileNumFromWAL(log);
540     assertTrue("Log should have a timestamp older than now",
541         curTime > oldFilenum && oldFilenum != -1);
542 
543     assertTrue("The log shouldn't have rolled yet",
544         oldFilenum == DefaultWALProvider.extractFileNumFromWAL(log));
545     final DatanodeInfo[] pipeline = log.getPipeLine();
546     assertTrue(pipeline.length ==
547         fs.getDefaultReplication(TEST_UTIL.getDataTestDirOnTestFS()));
548 
549     // kill a datanode in the pipeline to force a log roll on the next sync()
550     // This function is synchronous, when it returns the node is killed.
551     assertTrue(dfsCluster.stopDataNode(pipeline[0].getName()) != null);
552 
553     // this write should succeed, but trigger a log roll
554     writeData(table, 2);
555     long newFilenum = DefaultWALProvider.extractFileNumFromWAL(log);
556 
557     assertTrue("Missing datanode should've triggered a log roll",
558         newFilenum > oldFilenum && newFilenum > curTime);
559 
560     assertTrue("The log rolling hook should have been called with the low replication flag",
561         lowReplicationHookCalled.get());
562 
563     // write some more log data (this should use a new hdfs_out)
564     writeData(table, 3);
565     assertTrue("The log should not roll again.",
566         DefaultWALProvider.extractFileNumFromWAL(log) == newFilenum);
567     // kill another datanode in the pipeline, so the replicas will be lower than
568     // the configured value 2.
569     assertTrue(dfsCluster.stopDataNode(pipeline[1].getName()) != null);
570 
571     batchWriteAndWait(table, log, 3, false, 14000);
572     int replication = log.getLogReplication();
573     assertTrue("LowReplication Roller should've been disabled, current replication="
574             + replication, !log.isLowReplicationRollEnabled());
575 
576     dfsCluster
577         .startDataNodes(TEST_UTIL.getConfiguration(), 1, true, null, null);
578 
579     // Force roll writer. The new log file will have the default replications,
580     // and the LowReplication Roller will be enabled.
581     log.rollWriter(true);
582     batchWriteAndWait(table, log, 13, true, 10000);
583     replication = log.getLogReplication();
584     assertTrue("New log file should have the default replication instead of " +
585       replication,
586       replication == fs.getDefaultReplication(TEST_UTIL.getDataTestDirOnTestFS()));
587     assertTrue("LowReplication Roller should've been enabled", log.isLowReplicationRollEnabled());
588   }
589 
590   /**
591    * Test that WAL is rolled when all data nodes in the pipeline have been
592    * restarted.
593    * @throws Exception
594    */
595   @Test
596   public void testLogRollOnPipelineRestart() throws Exception {
597     LOG.info("Starting testLogRollOnPipelineRestart");
598     assertTrue("This test requires WAL file replication.",
599       fs.getDefaultReplication(TEST_UTIL.getDataTestDirOnTestFS()) > 1);
600     LOG.info("Replication=" +
601       fs.getDefaultReplication(TEST_UTIL.getDataTestDirOnTestFS()));
602     // When the hbase:meta table can be opened, the region servers are running
603     Table t = new HTable(TEST_UTIL.getConfiguration(), TableName.META_TABLE_NAME);
604     try {
605       this.server = cluster.getRegionServer(0);
606 
607       // Create the test table and open it
608       HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(getName()));
609       desc.addFamily(new HColumnDescriptor(HConstants.CATALOG_FAMILY));
610 
611       admin.createTable(desc);
612       Table table = new HTable(TEST_UTIL.getConfiguration(), desc.getTableName());
613 
614       server = TEST_UTIL.getRSForFirstRegionInTable(desc.getTableName());
615       HRegionInfo region = server.getOnlineRegions(desc.getTableName()).get(0).getRegionInfo();
616       final WAL log = server.getWAL(region);
617       final List<Path> paths = new ArrayList<Path>();
618       final List<Integer> preLogRolledCalled = new ArrayList<Integer>();
619 
620       paths.add(DefaultWALProvider.getCurrentFileName(log));
621       log.registerWALActionsListener(new WALActionsListener.Base() {
622 
623         @Override
624         public void preLogRoll(Path oldFile, Path newFile)  {
625           LOG.debug("preLogRoll: oldFile="+oldFile+" newFile="+newFile);
626           preLogRolledCalled.add(new Integer(1));
627         }
628         @Override
629         public void postLogRoll(Path oldFile, Path newFile) {
630           paths.add(newFile);
631         }
632       });
633 
634       // don't run this test without append support (HDFS-200 & HDFS-142)
635       assertTrue("Need append support for this test", FSUtils
636           .isAppendSupported(TEST_UTIL.getConfiguration()));
637 
638       writeData(table, 1002);
639 
640       long curTime = System.currentTimeMillis();
641       LOG.info("log.getCurrentFileName()): " + DefaultWALProvider.getCurrentFileName(log));
642       long oldFilenum = DefaultWALProvider.extractFileNumFromWAL(log);
643       assertTrue("Log should have a timestamp older than now",
644           curTime > oldFilenum && oldFilenum != -1);
645 
646       assertTrue("The log shouldn't have rolled yet", oldFilenum ==
647           DefaultWALProvider.extractFileNumFromWAL(log));
648 
649       // roll all datanodes in the pipeline
650       dfsCluster.restartDataNodes();
651       Thread.sleep(1000);
652       dfsCluster.waitActive();
653       LOG.info("Data Nodes restarted");
654       validateData(table, 1002);
655 
656       // this write should succeed, but trigger a log roll
657       writeData(table, 1003);
658       long newFilenum = DefaultWALProvider.extractFileNumFromWAL(log);
659 
660       assertTrue("Missing datanode should've triggered a log roll",
661           newFilenum > oldFilenum && newFilenum > curTime);
662       validateData(table, 1003);
663 
664       writeData(table, 1004);
665 
666       // roll all datanode again
667       dfsCluster.restartDataNodes();
668       Thread.sleep(1000);
669       dfsCluster.waitActive();
670       LOG.info("Data Nodes restarted");
671       validateData(table, 1004);
672 
673       // this write should succeed, but trigger a log roll
674       writeData(table, 1005);
675 
676       // force a log roll to read back and verify previously written logs
677       log.rollWriter(true);
678       assertTrue("preLogRolledCalled has size of " + preLogRolledCalled.size(),
679           preLogRolledCalled.size() >= 1);
680 
681       // read back the data written
682       Set<String> loggedRows = new HashSet<String>();
683       FSUtils fsUtils = FSUtils.getInstance(fs, TEST_UTIL.getConfiguration());
684       for (Path p : paths) {
685         LOG.debug("recovering lease for " + p);
686         fsUtils.recoverFileLease(((HFileSystem)fs).getBackingFs(), p,
687           TEST_UTIL.getConfiguration(), null);
688 
689         LOG.debug("Reading WAL "+FSUtils.getPath(p));
690         WAL.Reader reader = null;
691         try {
692           reader = WALFactory.createReader(fs, p, TEST_UTIL.getConfiguration());
693           WAL.Entry entry;
694           while ((entry = reader.next()) != null) {
695             LOG.debug("#"+entry.getKey().getLogSeqNum()+": "+entry.getEdit().getCells());
696             for (Cell cell : entry.getEdit().getCells()) {
697               loggedRows.add(Bytes.toStringBinary(cell.getRow()));
698             }
699           }
700         } catch (EOFException e) {
701           LOG.debug("EOF reading file "+FSUtils.getPath(p));
702         } finally {
703           if (reader != null) reader.close();
704         }
705       }
706 
707       // verify the written rows are there
708       assertTrue(loggedRows.contains("row1002"));
709       assertTrue(loggedRows.contains("row1003"));
710       assertTrue(loggedRows.contains("row1004"));
711       assertTrue(loggedRows.contains("row1005"));
712 
713       // flush all regions
714       for (Region r: server.getOnlineRegionsLocalContext()) {
715         try {
716           r.flush(true);
717         } catch (Exception e) {
718           // This try/catch was added by HBASE-14317. It is needed
719           // because this issue tightened up the semantic such that
720           // a failed append could not be followed by a successful
721           // sync. What is coming out here is a failed sync, a sync
722           // that used to 'pass'.
723           LOG.info(e);
724         }
725       }
726 
727       ResultScanner scanner = table.getScanner(new Scan());
728       try {
729         for (int i=2; i<=5; i++) {
730           Result r = scanner.next();
731           assertNotNull(r);
732           assertFalse(r.isEmpty());
733           assertEquals("row100"+i, Bytes.toString(r.getRow()));
734         }
735       } finally {
736         scanner.close();
737       }
738 
739       // verify that no region servers aborted
740       for (JVMClusterUtil.RegionServerThread rsThread:
741         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
742         assertFalse(rsThread.getRegionServer().isAborted());
743       }
744     } finally {
745       if (t != null) t.close();
746     }
747   }
748 
749   /**
750    * Tests that logs are deleted when some region has a compaction
751    * record in WAL and no other records. See HBASE-8597.
752    */
753   @Test
754   public void testCompactionRecordDoesntBlockRolling() throws Exception {
755     Table table = null;
756 
757     // When the hbase:meta table can be opened, the region servers are running
758     Table t = new HTable(TEST_UTIL.getConfiguration(), TableName.META_TABLE_NAME);
759     try {
760       table = createTestTable(getName());
761 
762       server = TEST_UTIL.getRSForFirstRegionInTable(table.getName());
763       Region region = server.getOnlineRegions(table.getName()).get(0);
764       final WAL log = server.getWAL(region.getRegionInfo());
765       Store s = region.getStore(HConstants.CATALOG_FAMILY);
766 
767       //have to flush namespace to ensure it doesn't affect wall tests
768       admin.flush(TableName.NAMESPACE_TABLE_NAME);
769 
770       // Put some stuff into table, to make sure we have some files to compact.
771       for (int i = 1; i <= 2; ++i) {
772         doPut(table, i);
773         admin.flush(table.getName());
774       }
775       doPut(table, 3); // don't flush yet, or compaction might trigger before we roll WAL
776       assertEquals("Should have no WAL after initial writes", 0,
777           DefaultWALProvider.getNumRolledLogFiles(log));
778       assertEquals(2, s.getStorefilesCount());
779 
780       // Roll the log and compact table, to have compaction record in the 2nd WAL.
781       log.rollWriter();
782       assertEquals("Should have WAL; one table is not flushed", 1,
783           DefaultWALProvider.getNumRolledLogFiles(log));
784       admin.flush(table.getName());
785       region.compact(false);
786       // Wait for compaction in case if flush triggered it before us.
787       Assert.assertNotNull(s);
788       for (int waitTime = 3000; s.getStorefilesCount() > 1 && waitTime > 0; waitTime -= 200) {
789         Threads.sleepWithoutInterrupt(200);
790       }
791       assertEquals("Compaction didn't happen", 1, s.getStorefilesCount());
792 
793       // Write some value to the table so the WAL cannot be deleted until table is flushed.
794       doPut(table, 0); // Now 2nd WAL will have both compaction and put record for table.
795       log.rollWriter(); // 1st WAL deleted, 2nd not deleted yet.
796       assertEquals("Should have WAL; one table is not flushed", 1,
797           DefaultWALProvider.getNumRolledLogFiles(log));
798 
799       // Flush table to make latest WAL obsolete; write another record, and roll again.
800       admin.flush(table.getName());
801       doPut(table, 1);
802       log.rollWriter(); // Now 2nd WAL is deleted and 3rd is added.
803       assertEquals("Should have 1 WALs at the end", 1,
804           DefaultWALProvider.getNumRolledLogFiles(log));
805     } finally {
806       if (t != null) t.close();
807       if (table != null) table.close();
808     }
809   }
810 
811   private void doPut(Table table, int i) throws IOException {
812     Put put = new Put(Bytes.toBytes("row" + String.format("%1$04d", i)));
813     put.add(HConstants.CATALOG_FAMILY, null, value);
814     table.put(put);
815   }
816 
817   private Table createTestTable(String tableName) throws IOException {
818     // Create the test table and open it
819     HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(tableName));
820     desc.addFamily(new HColumnDescriptor(HConstants.CATALOG_FAMILY));
821     admin.createTable(desc);
822     return new HTable(TEST_UTIL.getConfiguration(), desc.getTableName());
823   }
824 }
825