Use NUMA topology to shard the DFA state cache mutex.

Change-Id: I3df2b59a5c98a66de1198794f52463eda8b65608
Reviewed-on: https://code-review.googlesource.com/c/36330
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/Makefile b/Makefile
index f001f06..110ca73 100644
--- a/Makefile
+++ b/Makefile
@@ -7,6 +7,11 @@
 # CCICU=$(shell pkg-config icu-uc --cflags) -DRE2_USE_ICU
 # LDICU=$(shell pkg-config icu-uc --libs)
 
+# To use NUMA topology to shard the DFA state cache mutex,
+# uncomment the next two lines:
+# CCNUMA=-DRE2_USE_NUMA
+# LDNUMA=-lnuma
+
 # To build against PCRE for testing or benchmarking,
 # uncomment the next two lines:
 # CCPCRE=-I/usr/local/include -DUSEPCRE
@@ -17,8 +22,8 @@
 CXXFLAGS?=-O3 -g
 LDFLAGS?=
 # required
-RE2_CXXFLAGS?=-std=c++11 -pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE)
-RE2_LDFLAGS?=-pthread $(LDICU) $(LDPCRE)
+RE2_CXXFLAGS?=-std=c++11 -pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCNUMA) $(CCPCRE)
+RE2_LDFLAGS?=-pthread $(LDICU) $(LDNUMA) $(LDPCRE)
 AR?=ar
 ARFLAGS?=rsc
 NM?=nm
@@ -287,7 +292,7 @@
 	@echo
 
 static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
-static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS)
+static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDNUMA) $(LDFLAGS)
 static-testinstall:
 	@mkdir -p obj
 	@cp testinstall.cc obj
@@ -301,7 +306,7 @@
 endif
 
 shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
-shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS)
+shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDNUMA) $(LDFLAGS)
 shared-testinstall:
 	@mkdir -p obj
 	@cp testinstall.cc obj
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 2df10b2..5c50b4a 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -45,6 +45,11 @@
 #include "re2/prog.h"
 #include "re2/stringpiece.h"
 
+#if defined(RE2_USE_NUMA)
+#include <numa.h>
+#include <sched.h>
+#endif
+
 // Silence "zero-sized array in struct/union" warning for DFA::State::next_.
 #ifdef _MSC_VER
 #pragma warning(disable: 4200)
@@ -338,6 +343,12 @@
     return prog_->bytemap()[c];
   }
 
+  // Determines how many ways to shard cache_mutex_.
+  int CacheMutexCount() const;
+
+  // Determines which shard of cache_mutex_ to use.
+  int WhichCacheMutex() const;
+
   // Constant after initialization.
   Prog* prog_;              // The regular expression program to run.
   Prog::MatchKind kind_;    // The kind of DFA.
@@ -459,7 +470,7 @@
     astack_(NULL),
     mem_budget_(max_mem),
     cache_mutex_storage_(NULL),
-    cache_mutex_count_(1) {
+    cache_mutex_count_(CacheMutexCount()) {
   if (ExtraDebug)
     fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str());
   int nmark = 0;
@@ -1157,7 +1168,8 @@
   RWLocker& operator=(const RWLocker&) = delete;
 };
 
-DFA::RWLocker::RWLocker(DFA* dfa) : dfa_(dfa), which_(0), writing_(false) {
+DFA::RWLocker::RWLocker(DFA* dfa)
+    : dfa_(dfa), which_(dfa_->WhichCacheMutex()), writing_(false) {
   dfa_->cache_mutex_[which_].ReaderLock();
 }
 
@@ -2171,4 +2183,55 @@
   return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen);
 }
 
+#if defined(RE2_USE_NUMA)
+static int numa_max_count;
+static std::vector<int>* numa_cpu_map;
+
+void InitNUMA() {
+  numa_max_count = 0;
+  numa_cpu_map = new std::vector<int>;
+  if (numa_available() == -1)
+    return;
+  int nodes = numa_num_configured_nodes();
+  if (nodes < 1)
+    return;
+  int cpus = numa_num_configured_cpus();
+  if (cpus < 1)
+    return;
+  std::vector<int> count;
+  count.resize(nodes);
+  for (int cpu = 0; cpu < cpus; cpu++) {
+    int node = numa_node_of_cpu(cpu);
+    numa_cpu_map->emplace_back(count[node]++);
+  }
+  numa_max_count = *std::max_element(count.begin(), count.end());
+}
+#endif
+
+int DFA::CacheMutexCount() const {
+#if !defined(RE2_USE_NUMA)
+  return 1;
+#else
+  if (!prog_->shard_cache_mutex())
+    return 1;
+  static std::once_flag numa_once;
+  std::call_once(numa_once, &InitNUMA);
+  if (numa_max_count == 0)
+    return 1;
+  return numa_max_count;
+#endif
+}
+
+int DFA::WhichCacheMutex() const {
+#if !defined(RE2_USE_NUMA)
+  return 0;
+#else
+  if (!prog_->shard_cache_mutex())
+    return 0;
+  if (numa_cpu_map->empty())
+    return 0;
+  return (*numa_cpu_map)[sched_getcpu()];
+#endif
+}
+
 }  // namespace re2