MB-20869: Simplify test to fix intermittent test_access_scanner failure 84/67884/11
authorDave Rigby <daver@couchbase.com>
Wed, 7 Sep 2016 14:40:08 +0000 (15:40 +0100)
committerDave Rigby <daver@couchbase.com>
Thu, 29 Sep 2016 12:36:56 +0000 (12:36 +0000)
In test_access_scanner the check that the number of non-resident items
was less than 6% of the total is incorrect - it was using integer
division (6/100) and hence the RHS of the check was always zero, which
would always succeed. Fix the check so it calculates the percentage
correctly.

Additionally we did not wait for the previous AccessScanner run to
finalize - which meant that the second start of the access acanner
could fail to start (and consequently we would timeout with the
message:

    Exceeded maximum wait time of 60000000us waiting for stat
    'ep_num_access_scanner_runs' to be greater or equal than 8 -
    aborting.

To solve this, change to just one iteration of creating the file (not
sure why it did two before), and move the increment of stats.alogRuns
to /after/ the access log files are renamed.

Change-Id: I6c3b6eeed91302fd7fa67dfade6ae954078fc125
Reviewed-on: http://review.couchbase.org/67884
Reviewed-by: Manu Dhundi <manu@couchbase.com>
Tested-by: buildbot <build@couchbase.com>
src/access_scanner.cc
tests/ep_test_apis.cc
tests/ep_testsuite.cc

index 2a3918e..cce92b7 100644 (file)
@@ -85,13 +85,14 @@ public:
     virtual void complete() {
         update();
 
-        if (log != NULL) {
+        if (log == nullptr) {
+            updateStateFinalizer(false);
+        } else {
             size_t num_items = log->itemsLogged[ML_NEW];
             log->commit1();
             log->commit2();
             delete log;
             log = NULL;
-            ++stats.alogRuns;
             stats.alogRuntime.store(ep_real_time() - startTime);
             stats.alogNumItems.store(num_items);
             stats.accessScannerHisto.add((gethrtime() - taskStart) / 1000);
@@ -100,7 +101,7 @@ public:
                 LOG(EXTENSION_LOG_NOTICE, "The new access log file is empty. "
                     "Delete it without replacing the current access log...");
                 remove(next.c_str());
-                updateStateFinalizer();
+                updateStateFinalizer(true);
                 return;
             }
 
@@ -108,7 +109,7 @@ public:
                 LOG(EXTENSION_LOG_WARNING, "Failed to remove access log file "
                     "'%s': %s", prev.c_str(), strerror(errno));
                 remove(next.c_str());
-                updateStateFinalizer();
+                updateStateFinalizer(true);
                 return;
             }
             LOG(EXTENSION_LOG_NOTICE, "Removed old access log file: '%s'",
@@ -119,7 +120,7 @@ public:
                     "from '%s' to '%s': %s", name.c_str(), prev.c_str(),
                     strerror(errno));
                 remove(next.c_str());
-                updateStateFinalizer();
+                updateStateFinalizer(true);
                 return;
             }
             LOG(EXTENSION_LOG_NOTICE, "Renamed access log file from '%s' to "
@@ -129,23 +130,34 @@ public:
                     "from '%s' to '%s': %s", next.c_str(), name.c_str(),
                     strerror(errno));
                 remove(next.c_str());
-                updateStateFinalizer();
+                updateStateFinalizer(true);
                 return;
             }
             LOG(EXTENSION_LOG_NOTICE, "New access log file '%s' created with "
                 "%" PRIu64 " keys", name.c_str(),
                 static_cast<uint64_t>(num_items));
+            updateStateFinalizer(true);
         }
-
-        updateStateFinalizer();
     }
 
 private:
-    void updateStateFinalizer() {
+    /**
+     * Finalizer method called at the end of completing a visit.
+     * @param created_log: Did we successfully create a MutationLog object on
+     * this run?
+     */
+    void updateStateFinalizer(bool created_log) {
         if (++(as.completedCount) == store.getVBuckets().getNumShards()) {
             bool inverse = false;
             stateFinalizer.compare_exchange_strong(inverse, true);
         }
+        if (created_log) {
+            // Successfully created an access log - increment stat.
+            // Done after the new file created
+            // to aid in testing - once the stat has the new value the access.log
+            // file can be safely checked.
+            ++stats.alogRuns;
+        }
     }
 
     EventuallyPersistentStore &store;
index 35acbd5..cd1c717 100644 (file)
@@ -1148,6 +1148,12 @@ uint64_t get_stat(ENGINE_HANDLE* h, ENGINE_HANDLE_V1* h1,
 }
 
 template<>
+bool get_stat(ENGINE_HANDLE* h, ENGINE_HANDLE_V1* h1,
+              const char* statname, const char* statkey) {
+    return get_str_stat(h, h1, statname, statkey) == "true";
+}
+
+template<>
 std::string get_stat(ENGINE_HANDLE* h, ENGINE_HANDLE_V1* h1,
                      const char* statname, const char* statkey) {
     std::lock_guard<std::mutex> lh(vals_mutex);
index 7932290..6900fc1 100644 (file)
@@ -2800,11 +2800,12 @@ static enum test_result test_access_scanner(ENGINE_HANDLE *h,
 
     const int num_shards = get_int_stat(h, h1, "ep_workload:num_shards",
                                         "workload");
-    int access_scanner_skips = 0, alog_runs = 0;
     name = name + ".0";
     std::string prev(name + ".old");
 
-    /* Get the resident ratio down to below 90% */
+    /* Get the resident ratio down to below 95% - point at which access.log
+     * generation occurs.
+     */
     int num_items = 0;
     while (true) {
         // Gathering stats on every store is expensive, just check every 100 iterations
@@ -2827,19 +2828,18 @@ static enum test_result test_access_scanner(ENGINE_HANDLE *h,
     wait_for_flusher_to_settle(h, h1);
     verify_curr_items(h, h1, num_items, "Wrong number of items");
     int num_non_resident = get_int_stat(h, h1, "vb_active_num_non_resident");
-    cb_assert(num_non_resident >= ((float)(6/100) * num_items));
+    checkge(num_non_resident, num_items * 6 / 100,
+            "Expected num_non_resident to be at least 6% of total items");
 
-    /* Run access scanner task twice and expect it to generate access log */
-    for(int i = 0; i < 2; i++) {
-        alog_runs = get_int_stat(h, h1, "ep_num_access_scanner_runs");
-        check(set_param(h, h1, protocol_binary_engine_param_flush,
-                        "access_scanner_run", "true"),
-              "Failed to trigger access scanner");
-        wait_for_stat_to_be_gte(h, h1, "ep_num_access_scanner_runs",
-                                alog_runs + num_shards);
-    }
+    /* Run access scanner task once and expect it to generate access log */
+    check(set_param(h, h1, protocol_binary_engine_param_flush,
+                    "access_scanner_run", "true"),
+          "Failed to trigger access scanner");
+
+    // Wait for the number of runs to equal the number of shards.
+    wait_for_stat_to_be(h, h1, "ep_num_access_scanner_runs", num_shards);
 
-    /* This time since resident ratio is < 90% access log should be generated */
+    /* This time since resident ratio is < 95% access log should be generated */
     checkeq(0, access(name.c_str(), F_OK), "access log file should exist");
 
     /* Increase resident ratio by deleting items */
@@ -2848,7 +2848,8 @@ static enum test_result test_access_scanner(ENGINE_HANDLE *h,
           "Failed to set VB0 state.");
 
     /* Run access scanner task once */
-    access_scanner_skips = get_int_stat(h, h1, "ep_num_access_scanner_skips");
+    const int access_scanner_skips =
+            get_int_stat(h, h1, "ep_num_access_scanner_skips");
     check(set_param(h, h1, protocol_binary_engine_param_flush,
                     "access_scanner_run", "true"),
           "Failed to trigger access scanner");