Implement "front and back" prefix accel.

Change-Id: Ib585680e7b8ff1463db4cea55539e6fb52eed1d4
Reviewed-on: https://code-review.googlesource.com/c/re2/+/56657
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/prog.cc b/re2/prog.cc
index 06d58c8..fdc8022 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -7,6 +7,9 @@
 
 #include "re2/prog.h"
 
+#if defined(__GNUC__) && defined(__AVX2__)
+#include <immintrin.h>
+#endif
 #include <stdint.h>
 #include <string.h>
 #include <algorithm>
@@ -910,4 +913,49 @@
   }
 }
 
+const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
+  DCHECK_GE(prefix_size_, 2);
+  if (size < prefix_size_)
+    return NULL;
+  // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
+  // This also means that probing for prefix_back_ doesn't go out of bounds.
+  size -= prefix_size_-1;
+
+#if defined(__GNUC__) && defined(__AVX2__)
+  // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
+  if (size >= sizeof(__m256i)) {
+    const __m256i* fp = reinterpret_cast<const __m256i*>(
+        reinterpret_cast<const char*>(data));
+    const __m256i* bp = reinterpret_cast<const __m256i*>(
+        reinterpret_cast<const char*>(data) + prefix_size_-1);
+    const __m256i* endfp = fp + size/sizeof(__m256i);
+    const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
+    const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
+    while (fp != endfp) {
+      const __m256i f_loadu = _mm256_loadu_si256(fp++);
+      const __m256i b_loadu = _mm256_loadu_si256(bp++);
+      const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
+      const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
+      const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
+      if (fb_testz == 0) {  // ZF: 1 means zero, 0 means non-zero.
+        const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
+        const int fb_movemask = _mm256_movemask_epi8(fb_and);
+        const int fb_ctz = __builtin_ctz(fb_movemask);
+        return reinterpret_cast<const char*>(fp-1) + fb_ctz;
+      }
+    }
+    data = fp;
+    size = size%sizeof(__m256i);
+  }
+#endif
+
+  const char* p0 = reinterpret_cast<const char*>(data);
+  for (const char* p = p0;; p++) {
+    DCHECK_GE(size, static_cast<size_t>(p-p0));
+    p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
+    if (p == NULL || p[prefix_size_-1] == prefix_back_)
+      return p;
+  }
+}
+
 }  // namespace re2
diff --git a/re2/prog.h b/re2/prog.h
index e54208c..e9ce682 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -220,10 +220,15 @@
   // Accelerates to the first likely occurrence of the prefix.
   // Returns a pointer to the first byte or NULL if not found.
   const void* PrefixAccel(const void* data, size_t size) {
-    DCHECK_NE(prefix_size_, 0);
-    return memchr(data, prefix_front_, size);
+    DCHECK_GE(prefix_size_, 1);
+    return prefix_size_ == 1 ? memchr(data, prefix_front_, size)
+                             : PrefixAccel_FrontAndBack(data, size);
   }
 
+  // An implementation of prefix accel that looks for prefix_front_ and
+  // prefix_back_ to return fewer false positives than memchr(3) alone.
+  const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
+
   // Returns string representation of program for debugging.
   std::string Dump();
   std::string DumpUnanchored();