Revert the commits for the shard_cache_mutex option.

Change-Id: I663349ffcc7215dd4b8478da8d34a6b67effff47
Reviewed-on: https://code-review.googlesource.com/c/36650
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/Makefile b/Makefile
index 110ca73..f001f06 100644
--- a/Makefile
+++ b/Makefile
@@ -7,11 +7,6 @@
 # CCICU=$(shell pkg-config icu-uc --cflags) -DRE2_USE_ICU
 # LDICU=$(shell pkg-config icu-uc --libs)
 
-# To use NUMA topology to shard the DFA state cache mutex,
-# uncomment the next two lines:
-# CCNUMA=-DRE2_USE_NUMA
-# LDNUMA=-lnuma
-
 # To build against PCRE for testing or benchmarking,
 # uncomment the next two lines:
 # CCPCRE=-I/usr/local/include -DUSEPCRE
@@ -22,8 +17,8 @@
 CXXFLAGS?=-O3 -g
 LDFLAGS?=
 # required
-RE2_CXXFLAGS?=-std=c++11 -pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCNUMA) $(CCPCRE)
-RE2_LDFLAGS?=-pthread $(LDICU) $(LDNUMA) $(LDPCRE)
+RE2_CXXFLAGS?=-std=c++11 -pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE)
+RE2_LDFLAGS?=-pthread $(LDICU) $(LDPCRE)
 AR?=ar
 ARFLAGS?=rsc
 NM?=nm
@@ -292,7 +287,7 @@
 	@echo
 
 static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
-static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDNUMA) $(LDFLAGS)
+static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS)
 static-testinstall:
 	@mkdir -p obj
 	@cp testinstall.cc obj
@@ -306,7 +301,7 @@
 endif
 
 shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
-shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDNUMA) $(LDFLAGS)
+shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS)
 shared-testinstall:
 	@mkdir -p obj
 	@cp testinstall.cc obj
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 5574071..9bc8499 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -36,7 +36,6 @@
 #include <utility>
 #include <vector>
 
-#include "util/util.h"
 #include "util/logging.h"
 #include "util/mix.h"
 #include "util/mutex.h"
@@ -45,11 +44,6 @@
 #include "re2/prog.h"
 #include "re2/stringpiece.h"
 
-#if defined(RE2_USE_NUMA)
-#include <numa.h>
-#include <sched.h>
-#endif
-
 // Silence "zero-sized array in struct/union" warning for DFA::State::next_.
 #ifdef _MSC_VER
 #pragma warning(disable: 4200)
@@ -273,7 +267,7 @@
     bool run_forward;
     State* start;
     int first_byte;
-    RWLocker* cache_lock;
+    RWLocker *cache_lock;
     bool failed;     // "out" parameter: whether search gave up
     const char* ep;  // "out" parameter: end pointer for match
     SparseSet* matches;
@@ -343,12 +337,6 @@
     return prog_->bytemap()[c];
   }
 
-  // Determines how many ways to shard cache_mutex_.
-  int CacheMutexCount() const;
-
-  // Determines which shard of cache_mutex_ to use.
-  int WhichCacheMutex() const;
-
   // Constant after initialization.
   Prog* prog_;              // The regular expression program to run.
   Prog::MatchKind kind_;    // The kind of DFA.
@@ -363,32 +351,18 @@
   int nastack_;
 
   // State* cache.  Many threads use and add to the cache simultaneously,
-  // holding one cache_mutex_ for reading and mutex_ (above) when adding.
+  // holding cache_mutex_ for reading and mutex_ (above) when adding.
   // If the cache fills and needs to be discarded, the discarding is done
-  // while holding every cache_mutex_ for writing to avoid disrupting any
-  // readers.  Any State* is valid only while one (or every) cache_mutex_
-  // is held by the thread using the State*.
-  class alignas(CACHELINE_SIZE) AlignedMutex : public Mutex {};
-  AlignedMutex* cache_mutex_;
+  // while holding cache_mutex_ for writing, to avoid interrupting other
+  // readers.  Any State* pointers are only valid while cache_mutex_
+  // is held.
+  Mutex cache_mutex_;
   int64_t mem_budget_;     // Total memory budget for all States.
   int64_t state_budget_;   // Amount of memory remaining for new States.
   StateSet state_cache_;   // All States computed so far.
   StartInfo start_[kMaxStart];
-
-  // Until we can use C++17, we must handle the alignment ourselves. :(
-  // Someday, std::unique_ptr<AlignedMutex[]> will be quite sufficient.
-  char* cache_mutex_storage_;
-  int cache_mutex_count_;
 };
 
-template <typename T>
-static inline T* Align(T* base, size_t align) {
-  intptr_t tmp = reinterpret_cast<intptr_t>(base);
-  tmp += align - 1;
-  tmp &= ~(align - 1);
-  return reinterpret_cast<T*>(tmp);
-}
-
 // Shorthand for casting to uint8_t*.
 static inline const uint8_t* BytePtr(const void* v) {
   return reinterpret_cast<const uint8_t*>(v);
@@ -468,9 +442,7 @@
     q0_(NULL),
     q1_(NULL),
     astack_(NULL),
-    mem_budget_(max_mem),
-    cache_mutex_storage_(NULL),
-    cache_mutex_count_(CacheMutexCount()) {
+    mem_budget_(max_mem) {
   if (ExtraDebug)
     fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str());
   int nmark = 0;
@@ -482,13 +454,11 @@
              prog_->inst_count(kInstNop) +
              nmark + 1;  // + 1 for start inst
 
-  // Account for memory needed for DFA, q0, q1, astack, cache mutexes.
+  // Account for space needed for DFA, q0, q1, astack.
   mem_budget_ -= sizeof(DFA);
   mem_budget_ -= (prog_->size() + nmark) *
                  (sizeof(int)+sizeof(int)) * 2;  // q0, q1
   mem_budget_ -= nastack_ * sizeof(int);  // astack
-  mem_budget_ -= alignof(AlignedMutex) +
-                 sizeof(AlignedMutex) * cache_mutex_count_;  // cache mutexes
   if (mem_budget_ < 0) {
     init_failed_ = true;
     return;
@@ -513,23 +483,12 @@
   q0_ = new Workq(prog_->size(), nmark);
   q1_ = new Workq(prog_->size(), nmark);
   astack_ = new int[nastack_];
-
-  char* storage = new char[alignof(AlignedMutex) +
-                           sizeof(AlignedMutex) * cache_mutex_count_];
-  cache_mutex_storage_ = storage;
-  cache_mutex_ = new (Align(storage, alignof(AlignedMutex)))
-      AlignedMutex[cache_mutex_count_]();
 }
 
 DFA::~DFA() {
-  if (cache_mutex_storage_ != NULL) {
-    for (int i = 0; i < cache_mutex_count_; i++)
-      cache_mutex_[i].~AlignedMutex();
-  }
-  delete[] cache_mutex_storage_;
-  delete[] astack_;
-  delete q1_;
   delete q0_;
+  delete q1_;
+  delete[] astack_;
   ClearCache();
 }
 
@@ -811,8 +770,8 @@
   mem_budget_ -= mem + kStateCacheOverhead;
 
   // Allocate new state along with room for next_ and inst_.
-  char* storage = std::allocator<char>().allocate(mem);
-  State* s = new (storage) State;
+  char* space = std::allocator<char>().allocate(mem);
+  State* s = new (space) State;
   (void) new (s->next_) std::atomic<State*>[nnext];
   // Work around a unfortunate bug in older versions of libstdc++.
   // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658)
@@ -1152,7 +1111,7 @@
 
 class DFA::RWLocker {
  public:
-  explicit RWLocker(DFA* dfa);
+  explicit RWLocker(Mutex* mu);
   ~RWLocker();
 
   // If the lock is only held for reading right now,
@@ -1162,39 +1121,35 @@
   void LockForWriting();
 
  private:
-  DFA* dfa_;
-  int which_;
+  Mutex* mu_;
   bool writing_;
 
   RWLocker(const RWLocker&) = delete;
   RWLocker& operator=(const RWLocker&) = delete;
 };
 
-DFA::RWLocker::RWLocker(DFA* dfa)
-    : dfa_(dfa), which_(dfa_->WhichCacheMutex()), writing_(false) {
-  dfa_->cache_mutex_[which_].ReaderLock();
+DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) {
+  mu_->ReaderLock();
 }
 
 // This function is marked as NO_THREAD_SAFETY_ANALYSIS because the annotations
 // does not support lock upgrade.
 void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
   if (!writing_) {
-    dfa_->cache_mutex_[which_].ReaderUnlock();
-    for (int i = 0; i < dfa_->cache_mutex_count_; i++)
-      dfa_->cache_mutex_[i].WriterLock();
+    mu_->ReaderUnlock();
+    mu_->WriterLock();
     writing_ = true;
   }
 }
 
 DFA::RWLocker::~RWLocker() {
-  if (!writing_) {
-    dfa_->cache_mutex_[which_].ReaderUnlock();
-  } else {
-    for (int i = 0; i < dfa_->cache_mutex_count_; i++)
-      dfa_->cache_mutex_[i].WriterUnlock();
-  }
+  if (!writing_)
+    mu_->ReaderUnlock();
+  else
+    mu_->WriterUnlock();
 }
 
+
 // When the DFA's State cache fills, we discard all the states in the
 // cache and start over.  Many threads can be using and adding to the
 // cache at the same time, so we synchronize using the cache_mutex_
@@ -1820,7 +1775,7 @@
             run_forward, kind_);
   }
 
-  RWLocker l(this);
+  RWLocker l(&cache_mutex_);
   SearchParams params(text, context, &l);
   params.anchored = anchored;
   params.want_earliest_match = want_earliest_match;
@@ -1970,7 +1925,7 @@
 
   // Pick out start state for unanchored search
   // at beginning of text.
-  RWLocker l(this);
+  RWLocker l(&cache_mutex_);
   SearchParams params(StringPiece(), StringPiece(), &l);
   params.anchored = false;
   if (!AnalyzeSearch(&params) ||
@@ -2062,7 +2017,7 @@
   std::unordered_map<State*, int> previously_visited_states;
 
   // Pick out start state for anchored search at beginning of text.
-  RWLocker l(this);
+  RWLocker l(&cache_mutex_);
   SearchParams params(StringPiece(), StringPiece(), &l);
   params.anchored = true;
   if (!AnalyzeSearch(&params))
@@ -2185,55 +2140,4 @@
   return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen);
 }
 
-#if defined(RE2_USE_NUMA)
-static int numa_max_count;
-static std::vector<int>* numa_cpu_map;
-
-void InitNUMA() {
-  numa_max_count = 0;
-  numa_cpu_map = new std::vector<int>;
-  if (numa_available() == -1)
-    return;
-  int nodes = numa_num_configured_nodes();
-  if (nodes < 1)
-    return;
-  int cpus = numa_num_configured_cpus();
-  if (cpus < 1)
-    return;
-  std::vector<int> count;
-  count.resize(nodes);
-  for (int cpu = 0; cpu < cpus; cpu++) {
-    int node = numa_node_of_cpu(cpu);
-    numa_cpu_map->emplace_back(count[node]++);
-  }
-  numa_max_count = *std::max_element(count.begin(), count.end());
-}
-#endif
-
-int DFA::CacheMutexCount() const {
-#if !defined(RE2_USE_NUMA)
-  return 1;
-#else
-  if (!prog_->shard_cache_mutex())
-    return 1;
-  static std::once_flag numa_once;
-  std::call_once(numa_once, &InitNUMA);
-  if (numa_max_count == 0)
-    return 1;
-  return numa_max_count;
-#endif
-}
-
-int DFA::WhichCacheMutex() const {
-#if !defined(RE2_USE_NUMA)
-  return 0;
-#else
-  if (!prog_->shard_cache_mutex())
-    return 0;
-  if (numa_cpu_map->empty())
-    return 0;
-  return (*numa_cpu_map)[sched_getcpu()];
-#endif
-}
-
 }  // namespace re2
diff --git a/re2/prog.cc b/re2/prog.cc
index 1103111..b0ae375 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -115,7 +115,6 @@
     inst_(NULL),
     onepass_nodes_(NULL),
     dfa_mem_(0),
-    shard_cache_mutex_(false),
     dfa_first_(NULL),
     dfa_longest_(NULL) {
 }
diff --git a/re2/prog.h b/re2/prog.h
index 3bfc9e9..6d48135 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -200,8 +200,6 @@
   int inst_count(InstOp op) { return inst_count_[op]; }
   void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
   int64_t dfa_mem() { return dfa_mem_; }
-  void set_shard_cache_mutex(bool b) { shard_cache_mutex_ = b; }
-  bool shard_cache_mutex() { return shard_cache_mutex_; }
   int flags() { return flags_; }
   void set_flags(int flags) { flags_ = flags; }
   bool anchor_start() { return anchor_start_; }
@@ -401,7 +399,6 @@
   uint8_t* onepass_nodes_;  // data for OnePass nodes
 
   int64_t dfa_mem_;         // Maximum memory for DFAs.
-  bool shard_cache_mutex_;  // shard the DFA state cache mutex
   DFA* dfa_first_;          // DFA cached for kFirstMatch/kManyMatch
   DFA* dfa_longest_;        // DFA cached for kLongestMatch/kFullMatch
 
diff --git a/re2/re2.cc b/re2/re2.cc
index 82911bc..f3f96ae 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -49,7 +49,6 @@
     dot_nl_(false),
     never_capture_(false),
     case_sensitive_(true),
-    shard_cache_mutex_(false),
     perl_classes_(false),
     word_boundary_(false),
     one_line_(false) {
@@ -211,9 +210,7 @@
   // one third to the reverse prog, because the forward
   // Prog has two DFAs but the reverse prog has one.
   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
-  if (prog_ != NULL) {
-    prog_->set_shard_cache_mutex(options_.shard_cache_mutex());
-  } else {
+  if (prog_ == NULL) {
     if (options_.log_errors())
       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
     error_ = new string("pattern too large - compile failed");
@@ -234,9 +231,7 @@
   std::call_once(rprog_once_, [](const RE2* re) {
     re->rprog_ =
         re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
-    if (re->rprog_ != NULL) {
-      re->rprog_->set_shard_cache_mutex(re->options_.shard_cache_mutex());
-    } else {
+    if (re->rprog_ == NULL) {
       if (re->options_.log_errors())
         LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
       re->error_ = new string("pattern too large - reverse compile failed");
diff --git a/re2/re2.h b/re2/re2.h
index 5ea8efd..c93de56 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -560,7 +560,6 @@
     //   never_capture    (false) parse all parens as non-capturing
     //   case_sensitive   (true)  match is case-sensitive (regexp can override
     //                              with (?i) unless in posix_syntax mode)
-    //   shard_cache_mutex (false) shard the DFA state cache mutex (see below)
     //
     // The following options are only consulted when posix_syntax == true.
     // When posix_syntax == false, these features are always enabled and
@@ -597,11 +596,6 @@
     //
     // Once a DFA fills its budget, it flushes its cache and starts over.
     // If this happens too often, RE2 falls back on the NFA implementation.
-    //
-    // The shard_cache_mutex option causes RE2 to shard the DFA state cache
-    // mutex N ways; N will be determined automatically. This increases the
-    // memory footprint somewhat, so it should not be used unless profiling
-    // shows that reader lock contention is incurring substantial overhead.
 
     // For now, make the default budget something close to Code Search.
     static const int kDefaultMaxMem = 8<<20;
@@ -622,7 +616,6 @@
       dot_nl_(false),
       never_capture_(false),
       case_sensitive_(true),
-      shard_cache_mutex_(false),
       perl_classes_(false),
       word_boundary_(false),
       one_line_(false) {
@@ -671,9 +664,6 @@
     bool case_sensitive() const { return case_sensitive_; }
     void set_case_sensitive(bool b) { case_sensitive_ = b; }
 
-    bool shard_cache_mutex() const { return shard_cache_mutex_; }
-    void set_shard_cache_mutex(bool b) { shard_cache_mutex_ = b; }
-
     bool perl_classes() const { return perl_classes_; }
     void set_perl_classes(bool b) { perl_classes_ = b; }
 
@@ -700,7 +690,6 @@
     bool dot_nl_;
     bool never_capture_;
     bool case_sensitive_;
-    bool shard_cache_mutex_;
     bool perl_classes_;
     bool word_boundary_;
     bool one_line_;
diff --git a/re2/set.cc b/re2/set.cc
index a624d6a..9890de9 100644
--- a/re2/set.cc
+++ b/re2/set.cc
@@ -99,8 +99,6 @@
   delete[] sub;
 
   prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
-  if (prog_ != NULL)
-    prog_->set_shard_cache_mutex(options_.shard_cache_mutex());
   re->Decref();
   return prog_ != NULL;
 }
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
index 95044e9..8b82e0b 100644
--- a/re2/testing/regexp_benchmark.cc
+++ b/re2/testing/regexp_benchmark.cc
@@ -1574,36 +1574,4 @@
 BENCHMARK(PossibleMatchRange_Prefix);
 BENCHMARK(PossibleMatchRange_NoProg);
 
-static const RE2& GetShardCacheMutexRE(bool shard_cache_mutex) {
-  auto GetRE = [=]() {
-    RE2::Options options;
-    options.set_shard_cache_mutex(shard_cache_mutex);
-    return new RE2("foo*bar+qux?", options);
-  };
-  if (!shard_cache_mutex) {
-    static const auto* const re = GetRE();
-    return *re;
-  } else {
-    static const auto* const re = GetRE();
-    return *re;
-  }
-}
-
-void ShardCacheMutex_False(int n) {
-  const RE2& re = GetShardCacheMutexRE(false);
-  for (int i = 0; i < n; i++) {
-    RE2::PartialMatch("", re);
-  }
-}
-
-void ShardCacheMutex_True(int n) {
-  const RE2& re = GetShardCacheMutexRE(true);
-  for (int i = 0; i < n; i++) {
-    RE2::PartialMatch("", re);
-  }
-}
-
-BENCHMARK(ShardCacheMutex_False)->ThreadRange(1, NumCPUs());
-BENCHMARK(ShardCacheMutex_True)->ThreadRange(1, NumCPUs());
-
 }  // namespace re2
diff --git a/util/util.h b/util/util.h
index 4233956..33d100a 100644
--- a/util/util.h
+++ b/util/util.h
@@ -21,10 +21,6 @@
 #endif
 #endif
 
-#ifndef CACHELINE_SIZE
-#define CACHELINE_SIZE 64
-#endif
-
 #ifndef FALLTHROUGH_INTENDED
 #if defined(__clang__)
 #define FALLTHROUGH_INTENDED [[clang::fallthrough]]