View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import static org.junit.Assert.assertTrue;
22  
23  import java.io.IOException;
24  import java.util.concurrent.CountDownLatch;
25  import java.util.concurrent.TimeUnit;
26  
27  import org.apache.commons.logging.Log;
28  import org.apache.commons.logging.LogFactory;
29  import org.apache.hadoop.conf.Configuration;
30  import org.apache.hadoop.fs.FileSystem;
31  import org.apache.hadoop.fs.Path;
32  import org.apache.hadoop.hbase.CellScanner;
33  import org.apache.hadoop.hbase.ChoreService;
34  import org.apache.hadoop.hbase.CoordinatedStateManager;
35  import org.apache.hadoop.hbase.HBaseTestingUtility;
36  import org.apache.hadoop.hbase.HConstants;
37  import org.apache.hadoop.hbase.HTableDescriptor;
38  import org.apache.hadoop.hbase.Server;
39  import org.apache.hadoop.hbase.ServerName;
40  import org.apache.hadoop.hbase.TableName;
41  import org.apache.hadoop.hbase.client.ClusterConnection;
42  import org.apache.hadoop.hbase.client.Durability;
43  import org.apache.hadoop.hbase.client.Put;
44  import org.apache.hadoop.hbase.regionserver.wal.DamagedWALException;
45  import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
46  import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
47  import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
48  import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
49  import org.apache.hadoop.hbase.testclassification.MediumTests;
50  import org.apache.hadoop.hbase.util.Bytes;
51  import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
52  import org.apache.hadoop.hbase.util.Threads;
53  import org.apache.hadoop.hbase.wal.WAL;
54  import org.apache.hadoop.hbase.wal.WALKey;
55  import org.apache.hadoop.hbase.wal.WALProvider.Writer;
56  import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
57  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
58  import org.junit.After;
59  import org.junit.Before;
60  import org.junit.Rule;
61  import org.junit.Test;
62  import org.junit.experimental.categories.Category;
63  import org.junit.rules.TestName;
64  import org.mockito.Mockito;
65  
66  /**
67   * Testing for lock up of WAL subsystem.
68   * Copied from TestHRegion.
69   */
70  @Category({MediumTests.class})
71  public class TestWALLockup {
72    private static final Log LOG = LogFactory.getLog(TestWALLockup.class);
73    @Rule public TestName name = new TestName();
74  
75    private static final String COLUMN_FAMILY = "MyCF";
76    private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
77  
78    HRegion region = null;
79    // Do not run unit tests in parallel (? Why not?  It don't work?  Why not?  St.Ack)
80    private static HBaseTestingUtility TEST_UTIL;
81    private static Configuration CONF ;
82    private String dir;
83  
84    // Test names
85    protected TableName tableName;
86  
87    @Before
88    public void setup() throws IOException {
89      TEST_UTIL = HBaseTestingUtility.createLocalHTU();
90      CONF = TEST_UTIL.getConfiguration();
91      // Disable block cache.
92      CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
93      dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
94      tableName = TableName.valueOf(name.getMethodName());
95    }
96  
97    @After
98    public void tearDown() throws Exception {
99      EnvironmentEdgeManagerTestHelper.reset();
100     LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
101     TEST_UTIL.cleanupTestDir();
102   }
103 
104   String getName() {
105     return name.getMethodName();
106   }
107 
108   /**
109    * Reproduce locking up that happens when we get an inopportune sync during setup for
110    * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
111    * it is locked up.
112    * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
113    * set up a dodgy WAL that will throw an exception when we go to append to it.
114    */
115   @Test (timeout=20000)
116   public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
117     // A WAL that we can have throw exceptions when a flag is set.
118     class DodgyFSLog extends FSHLog {
119       // Set this when want the WAL to start throwing exceptions.
120       volatile boolean throwException = false;
121 
122       // Latch to hold up processing until after another operation has had time to run.
123       CountDownLatch latch = new CountDownLatch(1);
124 
125       public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
126       throws IOException {
127         super(fs, root, logDir, conf);
128       }
129 
130       @Override
131       protected void afterCreatingZigZagLatch() {
132         // If throwException set, then append will throw an exception causing the WAL to be
133         // rolled. We'll come in here. Hold up processing until a sync can get in before
134         // the zigzag has time to complete its setup and get its own sync in. This is what causes
135         // the lock up we've seen in production.
136         if (throwException) {
137           try {
138             LOG.info("LATCHED");
139             // So, timing can have it that the test can run and the bad flush below happens
140             // before we get here. In this case, we'll be stuck waiting on this latch but there
141             // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
142             // because all WALs have rolled. In this case, just give up on test.
143             if (!this.latch.await(5, TimeUnit.SECONDS)) {
144               LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
145             }
146           } catch (InterruptedException e) {
147             // TODO Auto-generated catch block
148             e.printStackTrace();
149           }
150         }
151       }
152 
153       @Override
154       protected void beforeWaitOnSafePoint() {
155         if (throwException) {
156           LOG.info("COUNTDOWN");
157           // Don't countdown latch until someone waiting on it otherwise, the above
158           // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
159           // be stuck; test won't go down
160           while (this.latch.getCount() <= 0) Threads.sleep(1);
161           this.latch.countDown();
162         }
163       }
164 
165       @Override
166       protected Writer createWriterInstance(Path path) throws IOException {
167         final Writer w = super.createWriterInstance(path);
168         return new Writer() {
169           @Override
170           public void close() throws IOException {
171             w.close();
172           }
173 
174           @Override
175           public void sync(boolean forceSync) throws IOException {
176             if (throwException) {
177               throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
178             }
179             w.sync(forceSync);
180           }
181 
182           @Override
183           public void append(Entry entry) throws IOException {
184             if (throwException) {
185               throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
186             }
187             w.append(entry);
188           }
189 
190           @Override
191           public long getLength() throws IOException {
192             return w.getLength();
193           }
194         };
195       }
196     }
197 
198     // Mocked up server and regionserver services. Needed below.
199     Server server = Mockito.mock(Server.class);
200     Mockito.when(server.getConfiguration()).thenReturn(CONF);
201     Mockito.when(server.isStopped()).thenReturn(false);
202     Mockito.when(server.isAborted()).thenReturn(false);
203     RegionServerServices services = Mockito.mock(RegionServerServices.class);
204 
205     // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
206     FileSystem fs = FileSystem.get(CONF);
207     Path rootDir = new Path(dir + getName());
208     DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
209     Path originalWAL = dodgyWAL.getCurrentFileName();
210     // I need a log roller running.
211     LogRoller logRoller = new LogRoller(server, services);
212     logRoller.addWAL(dodgyWAL);
213     // There is no 'stop' once a logRoller is running.. it just dies.
214     logRoller.start();
215     // Now get a region and start adding in edits.
216     HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
217     final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
218     byte [] bytes = Bytes.toBytes(getName());
219     MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
220     try {
221       // First get something into memstore. Make a Put and then pull the Cell out of it. Will
222       // manage append and sync carefully in below to manufacture hang. We keep adding same
223       // edit. WAL subsystem doesn't care.
224       Put put = new Put(bytes);
225       put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
226       WALKey key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(),
227           htd.getTableName(), System.currentTimeMillis(), mvcc);
228       WALEdit edit = new WALEdit();
229       CellScanner CellScanner = put.cellScanner();
230       assertTrue(CellScanner.advance());
231       edit.add(CellScanner.current());
232       // Put something in memstore and out in the WAL. Do a big number of appends so we push
233       // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
234       for (int i = 0; i < 1000; i++) {
235         region.put(put);
236       }
237       // Set it so we start throwing exceptions.
238       LOG.info("SET throwing of exception on append");
239       dodgyWAL.throwException = true;
240       // This append provokes a WAL roll request
241       dodgyWAL.append(htd, region.getRegionInfo(), key, edit, true);
242       boolean exception = false;
243       try {
244         dodgyWAL.sync();
245       } catch (Exception e) {
246         exception = true;
247       }
248       assertTrue("Did not get sync exception", exception);
249 
250       // Get a memstore flush going too so we have same hung profile as up in the issue over
251       // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
252       // by the zigzaglatch waiting on syncs to come home.
253       Thread t = new Thread ("Flusher") {
254         public void run() {
255           try {
256             if (region.getMemstoreSize() <= 0) {
257               throw new IOException("memstore size=" + region.getMemstoreSize());
258             }
259             region.flush(false);
260           } catch (IOException e) {
261             // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
262             // when roll completes.
263             LOG.info("In flush", e);
264           }
265           LOG.info("Exiting");
266         };
267       };
268       t.setDaemon(true);
269       t.start();
270       // Wait until 
271       while (dodgyWAL.latch.getCount() > 0) Threads.sleep(1);
272       // Now assert I got a new WAL file put in place even though loads of errors above.
273       assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
274       // Can I append to it?
275       dodgyWAL.throwException = false;
276       try {
277         region.put(put);
278       } catch (Exception e) {
279         LOG.info("In the put", e);
280       }
281     } finally {
282       // To stop logRoller, its server has to say it is stopped.
283       Mockito.when(server.isStopped()).thenReturn(true);
284       if (logRoller != null) logRoller.interrupt();
285       try {
286         if (region != null) region.close();
287         if (dodgyWAL != null) dodgyWAL.close();
288       } catch (Exception e) {
289         LOG.info("On way out", e);
290       }
291     }
292   }
293 
294   /**
295    * Reproduce locking up that happens when there's no further syncs after
296    * append fails, and causing an isolated sync then infinite wait. See
297    * HBASE-16960. If below is broken, we will see this test timeout because it
298    * is locked up.
299    * <p/>
300    * Steps for reproduce:<br/>
301    * 1. Trigger server abort through dodgyWAL1<br/>
302    * 2. Add a {@link DummyWALActionsListener} to dodgyWAL2 to cause ringbuffer
303    * event handler thread sleep for a while thus keeping {@code endOfBatch}
304    * false<br/>
305    * 3. Publish a sync then an append which will throw exception, check whether
306    * the sync could return
307    */
308   @Test(timeout = 20000)
309   public void testLockup16960() throws IOException {
310     // A WAL that we can have throw exceptions when a flag is set.
311     class DodgyFSLog extends FSHLog {
312       // Set this when want the WAL to start throwing exceptions.
313       volatile boolean throwException = false;
314 
315       public DodgyFSLog(FileSystem fs, Path root, String logDir,
316           Configuration conf) throws IOException {
317         super(fs, root, logDir, conf);
318       }
319 
320       @Override
321       protected Writer createWriterInstance(Path path) throws IOException {
322         final Writer w = super.createWriterInstance(path);
323         return new Writer() {
324           @Override
325           public void close() throws IOException {
326             w.close();
327           }
328 
329           @Override
330           public void sync(boolean forceSync) throws IOException {
331             if (throwException) {
332               throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
333             }
334             w.sync(forceSync);
335           }
336 
337           @Override
338           public void append(Entry entry) throws IOException {
339             if (throwException) {
340               throw new IOException(
341                   "FAKE! Failed to replace a bad datanode...APPEND");
342             }
343             w.append(entry);
344           }
345 
346           @Override
347           public long getLength() throws IOException {
348             return w.getLength();
349           }
350         };
351       }
352 
353       @Override
354       public byte[][] rollWriter(boolean force) throws FailedLogCloseException,
355           IOException {
356         if (throwException) {
357           throw new FailedLogCloseException("testLockup16960");
358         }
359         return super.rollWriter(force);
360       }
361     }
362 
363     // Mocked up server and regionserver services. Needed below.
364     Server server = new DummyServer(CONF, ServerName.valueOf(
365         "hostname1.example.org", 1234, 1L).toString());
366     RegionServerServices services = Mockito.mock(RegionServerServices.class);
367 
368     CONF.setLong("hbase.regionserver.hlog.sync.timeout", 10000);
369 
370     // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL,
371     // go ahead with test.
372     FileSystem fs = FileSystem.get(CONF);
373     Path rootDir = new Path(dir + getName());
374     DodgyFSLog dodgyWAL1 = new DodgyFSLog(fs, rootDir, getName(), CONF);
375 
376     Path rootDir2 = new Path(dir + getName() + "2");
377     final DodgyFSLog dodgyWAL2 = new DodgyFSLog(fs, rootDir2, getName() + "2",
378         CONF);
379     // Add a listener to force ringbuffer event handler sleep for a while
380     dodgyWAL2.registerWALActionsListener(new DummyWALActionsListener());
381 
382     // I need a log roller running.
383     LogRoller logRoller = new LogRoller(server, services);
384     logRoller.addWAL(dodgyWAL1);
385     logRoller.addWAL(dodgyWAL2);
386     // There is no 'stop' once a logRoller is running.. it just dies.
387     logRoller.start();
388     // Now get a region and start adding in edits.
389     HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
390     final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL1);
391     byte[] bytes = Bytes.toBytes(getName());
392     MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
393     try {
394       Put put = new Put(bytes);
395       put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
396       WALKey key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(),
397           htd.getTableName(), System.currentTimeMillis(), mvcc);
398       WALEdit edit = new WALEdit();
399       CellScanner CellScanner = put.cellScanner();
400       assertTrue(CellScanner.advance());
401       edit.add(CellScanner.current());
402 
403       LOG.info("SET throwing of exception on append");
404       dodgyWAL1.throwException = true;
405       // This append provokes a WAL roll request
406       dodgyWAL1.append(htd, region.getRegionInfo(), key, edit, true);
407       boolean exception = false;
408       try {
409         dodgyWAL1.sync();
410       } catch (Exception e) {
411         exception = true;
412       }
413       assertTrue("Did not get sync exception", exception);
414 
415       // LogRoller call dodgyWAL1.rollWriter get FailedLogCloseException and
416       // cause server abort.
417       try {
418         // wait LogRoller exit.
419         Thread.sleep(50);
420       } catch (InterruptedException e) {
421         e.printStackTrace();
422       }
423 
424       final CountDownLatch latch = new CountDownLatch(1);
425 
426       // make RingBufferEventHandler sleep 1s, so the following sync
427       // endOfBatch=false
428       key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(),
429           TableName.valueOf("sleep"), System.currentTimeMillis(), mvcc);
430       dodgyWAL2.append(htd, region.getRegionInfo(), key, edit, true);
431 
432       Thread t = new Thread("Sync") {
433         public void run() {
434           try {
435             dodgyWAL2.sync();
436           } catch (IOException e) {
437             LOG.info("In sync", e);
438           }
439           latch.countDown();
440           LOG.info("Sync exiting");
441         };
442       };
443       t.setDaemon(true);
444       t.start();
445       try {
446         // make sure sync have published.
447         Thread.sleep(100);
448       } catch (InterruptedException e1) {
449         e1.printStackTrace();
450       }
451       // make append throw DamagedWALException
452       key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(),
453           TableName.valueOf("DamagedWALException"), System.currentTimeMillis(), mvcc);
454       dodgyWAL2.append(htd, region.getRegionInfo(), key, edit, true);
455 
456       while (latch.getCount() > 0) {
457         Threads.sleep(100);
458       }
459       assertTrue(server.isAborted());
460     } finally {
461       if (logRoller != null) {
462         logRoller.interrupt();
463       }
464       try {
465         if (region != null) {
466           region.close();
467         }
468         if (dodgyWAL1 != null) {
469           dodgyWAL1.close();
470         }
471         if (dodgyWAL2 != null) {
472           dodgyWAL2.close();
473         }
474       } catch (Exception e) {
475         LOG.info("On way out", e);
476       }
477     }
478   }
479 
480   static class DummyServer implements Server {
481     private Configuration conf;
482     private String serverName;
483     private boolean isAborted = false;
484 
485     public DummyServer(Configuration conf, String serverName) {
486       this.conf = conf;
487       this.serverName = serverName;
488     }
489 
490     @Override
491     public Configuration getConfiguration() {
492       return conf;
493     }
494 
495     @Override
496     public ZooKeeperWatcher getZooKeeper() {
497       return null;
498     }
499 
500     @Override
501     public CoordinatedStateManager getCoordinatedStateManager() {
502       return null;
503     }
504 
505     @Override
506     public ClusterConnection getConnection() {
507       return null;
508     }
509 
510     @Override
511     public MetaTableLocator getMetaTableLocator() {
512       return null;
513     }
514 
515     @Override
516     public ServerName getServerName() {
517       return ServerName.valueOf(this.serverName);
518     }
519 
520     @Override
521     public void abort(String why, Throwable e) {
522       LOG.info("Aborting " + serverName);
523       this.isAborted = true;
524     }
525 
526     @Override
527     public boolean isAborted() {
528       return this.isAborted;
529     }
530 
531     @Override
532     public void stop(String why) {
533       this.isAborted = true;
534     }
535 
536     @Override
537     public boolean isStopped() {
538       return this.isAborted;
539     }
540 
541     @Override
542     public ChoreService getChoreService() {
543       return null;
544     }
545 
546   }
547 
548   static class DummyWALActionsListener extends WALActionsListener.Base {
549 
550     @Override
551     public void visitLogEntryBeforeWrite(HTableDescriptor htd, WALKey logKey,
552         WALEdit logEdit) throws IOException {
553       if (logKey.getTablename().getNameAsString().equalsIgnoreCase("sleep")) {
554         try {
555           Thread.sleep(1000);
556         } catch (InterruptedException e) {
557           e.printStackTrace();
558         }
559       }
560       if (logKey.getTablename().getNameAsString()
561           .equalsIgnoreCase("DamagedWALException")) {
562         throw new DamagedWALException("Failed appending");
563       }
564     }
565 
566   }
567 
568   /**
569    * @return A region on which you must call
570    *         {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)} when done.
571    */
572   public HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
573       Configuration conf, WAL wal) throws IOException {
574     return TEST_UTIL.createLocalHRegion(tableName.getName(), startKey, stopKey,
575       getName(), conf, false, Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
576   }
577 }