| // Copyright 2009 The RE2 Authors. All Rights Reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| #include <stddef.h> |
| #include <algorithm> |
| #include <memory> |
| #include <string> |
| #include <vector> |
| #include <utility> |
| |
| #include "util/test.h" |
| #include "util/logging.h" |
| #include "re2/filtered_re2.h" |
| #include "re2/re2.h" |
| |
| namespace re2 { |
| |
| struct FilterTestVars { |
| FilterTestVars() {} |
| explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {} |
| |
| std::vector<std::string> atoms; |
| std::vector<int> atom_indices; |
| std::vector<int> matches; |
| RE2::Options opts; |
| FilteredRE2 f; |
| }; |
| |
| TEST(FilteredRE2Test, EmptyTest) { |
| FilterTestVars v; |
| |
| v.f.Compile(&v.atoms); |
| EXPECT_EQ(0, v.atoms.size()); |
| |
| // Compile has no effect at all when called before Add: it will not |
| // record that it has been called and it will not clear the vector. |
| // The second point does not matter here, but the first point means |
| // that an error will be logged during the call to AllMatches. |
| v.f.AllMatches("foo", v.atom_indices, &v.matches); |
| EXPECT_EQ(0, v.matches.size()); |
| } |
| |
| TEST(FilteredRE2Test, SmallOrTest) { |
| FilterTestVars v(4); // override the minimum atom length |
| int id; |
| v.f.Add("(foo|bar)", v.opts, &id); |
| |
| v.f.Compile(&v.atoms); |
| EXPECT_EQ(0, v.atoms.size()); |
| |
| v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); |
| EXPECT_EQ(1, v.matches.size()); |
| EXPECT_EQ(id, v.matches[0]); |
| } |
| |
| TEST(FilteredRE2Test, SmallLatinTest) { |
| FilterTestVars v; |
| int id; |
| |
| v.opts.set_encoding(RE2::Options::EncodingLatin1); |
| v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); |
| v.f.Compile(&v.atoms); |
| EXPECT_EQ(1, v.atoms.size()); |
| EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); |
| |
| v.atom_indices.push_back(0); |
| v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); |
| EXPECT_EQ(1, v.matches.size()); |
| EXPECT_EQ(id, v.matches[0]); |
| } |
| |
| struct AtomTest { |
| const char* testname; |
| // If any test needs more than this many regexps or atoms, increase |
| // the size of the corresponding array. |
| const char* regexps[20]; |
| const char* atoms[20]; |
| }; |
| |
| AtomTest atom_tests[] = { |
| { |
| // This test checks to make sure empty patterns are allowed. |
| "CheckEmptyPattern", |
| {""}, |
| {} |
| }, { |
| // This test checks that all atoms of length greater than min length |
| // are found, and no atoms that are of smaller length are found. |
| "AllAtomsGtMinLengthFound", { |
| "(abc123|def456|ghi789).*mnop[x-z]+", |
| "abc..yyy..zz", |
| "mnmnpp[a-z]+PPP" |
| }, { |
| "abc123", |
| "def456", |
| "ghi789", |
| "mnop", |
| "abc", |
| "yyy", |
| "mnmnpp", |
| "ppp" |
| } |
| }, { |
| // Test to make sure that any atoms that have another atom as a |
| // substring in an OR are removed; that is, only the shortest |
| // substring is kept. |
| "SubstrAtomRemovesSuperStrInOr", { |
| "(abc123|abc|ghi789|abc1234).*[x-z]+", |
| "abcd..yyy..yyyzzz", |
| "mnmnpp[a-z]+PPP" |
| }, { |
| "abc", |
| "ghi789", |
| "abcd", |
| "yyy", |
| "yyyzzz", |
| "mnmnpp", |
| "ppp" |
| } |
| }, { |
| // Test character class expansion. |
| "CharClassExpansion", { |
| "m[a-c][d-f]n.*[x-z]+", |
| "[x-y]bcde[ab]" |
| }, { |
| "madn", "maen", "mafn", |
| "mbdn", "mben", "mbfn", |
| "mcdn", "mcen", "mcfn", |
| "xbcdea", "xbcdeb", |
| "ybcdea", "ybcdeb" |
| } |
| }, { |
| // Test upper/lower of non-ASCII. |
| "UnicodeLower", { |
| "(?i)ΔδΠϖπΣςσ", |
| "ΛΜΝΟΠ", |
| "ψρστυ", |
| }, { |
| "δδπππσσσ", |
| "λμνοπ", |
| "ψρστυ", |
| }, |
| }, |
| }; |
| |
| void AddRegexpsAndCompile(const char* regexps[], |
| size_t n, |
| struct FilterTestVars* v) { |
| for (size_t i = 0; i < n; i++) { |
| int id; |
| v->f.Add(regexps[i], v->opts, &id); |
| } |
| v->f.Compile(&v->atoms); |
| } |
| |
| bool CheckExpectedAtoms(const char* atoms[], |
| size_t n, |
| const char* testname, |
| struct FilterTestVars* v) { |
| std::vector<std::string> expected; |
| for (size_t i = 0; i < n; i++) |
| expected.push_back(atoms[i]); |
| |
| bool pass = expected.size() == v->atoms.size(); |
| |
| std::sort(v->atoms.begin(), v->atoms.end()); |
| std::sort(expected.begin(), expected.end()); |
| for (size_t i = 0; pass && i < n; i++) |
| pass = pass && expected[i] == v->atoms[i]; |
| |
| if (!pass) { |
| LOG(ERROR) << "Failed " << testname; |
| LOG(ERROR) << "Expected #atoms = " << expected.size(); |
| for (size_t i = 0; i < expected.size(); i++) |
| LOG(ERROR) << expected[i]; |
| LOG(ERROR) << "Found #atoms = " << v->atoms.size(); |
| for (size_t i = 0; i < v->atoms.size(); i++) |
| LOG(ERROR) << v->atoms[i]; |
| } |
| |
| return pass; |
| } |
| |
| TEST(FilteredRE2Test, AtomTests) { |
| int nfail = 0; |
| for (size_t i = 0; i < arraysize(atom_tests); i++) { |
| FilterTestVars v; |
| AtomTest* t = &atom_tests[i]; |
| size_t nregexp, natom; |
| for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
| if (t->regexps[nregexp] == NULL) |
| break; |
| for (natom = 0; natom < arraysize(t->atoms); natom++) |
| if (t->atoms[natom] == NULL) |
| break; |
| AddRegexpsAndCompile(t->regexps, nregexp, &v); |
| if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) |
| nfail++; |
| } |
| EXPECT_EQ(0, nfail); |
| } |
| |
| void FindAtomIndices(const std::vector<std::string>& atoms, |
| const std::vector<std::string>& matched_atoms, |
| std::vector<int>* atom_indices) { |
| atom_indices->clear(); |
| for (size_t i = 0; i < matched_atoms.size(); i++) { |
| for (size_t j = 0; j < atoms.size(); j++) { |
| if (matched_atoms[i] == atoms[j]) { |
| atom_indices->push_back(static_cast<int>(j)); |
| break; |
| } |
| } |
| } |
| } |
| |
| TEST(FilteredRE2Test, MatchEmptyPattern) { |
| FilterTestVars v; |
| AtomTest* t = &atom_tests[0]; |
| // We are using the regexps used in one of the atom tests |
| // for this test. Adding the EXPECT here to make sure |
| // the index we use for the test is for the correct test. |
| EXPECT_EQ("CheckEmptyPattern", std::string(t->testname)); |
| size_t nregexp; |
| for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
| if (t->regexps[nregexp] == NULL) |
| break; |
| AddRegexpsAndCompile(t->regexps, nregexp, &v); |
| std::string text = "0123"; |
| std::vector<int> atom_ids; |
| std::vector<int> matching_regexps; |
| EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); |
| } |
| |
| TEST(FilteredRE2Test, MatchTests) { |
| FilterTestVars v; |
| AtomTest* t = &atom_tests[2]; |
| // We are using the regexps used in one of the atom tests |
| // for this test. |
| EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname)); |
| size_t nregexp; |
| for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
| if (t->regexps[nregexp] == NULL) |
| break; |
| AddRegexpsAndCompile(t->regexps, nregexp, &v); |
| |
| std::string text = "abc121212xyz"; |
| // atoms = abc |
| std::vector<int> atom_ids; |
| std::vector<std::string> atoms; |
| atoms.push_back("abc"); |
| FindAtomIndices(v.atoms, atoms, &atom_ids); |
| std::vector<int> matching_regexps; |
| v.f.AllMatches(text, atom_ids, &matching_regexps); |
| EXPECT_EQ(1, matching_regexps.size()); |
| |
| text = "abc12312yyyzzz"; |
| atoms.clear(); |
| atoms.push_back("abc"); |
| atoms.push_back("yyy"); |
| atoms.push_back("yyyzzz"); |
| FindAtomIndices(v.atoms, atoms, &atom_ids); |
| v.f.AllMatches(text, atom_ids, &matching_regexps); |
| EXPECT_EQ(1, matching_regexps.size()); |
| |
| text = "abcd12yyy32yyyzzz"; |
| atoms.clear(); |
| atoms.push_back("abc"); |
| atoms.push_back("abcd"); |
| atoms.push_back("yyy"); |
| atoms.push_back("yyyzzz"); |
| FindAtomIndices(v.atoms, atoms, &atom_ids); |
| LOG(INFO) << "S: " << atom_ids.size(); |
| for (size_t i = 0; i < atom_ids.size(); i++) |
| LOG(INFO) << "i: " << i << " : " << atom_ids[i]; |
| v.f.AllMatches(text, atom_ids, &matching_regexps); |
| EXPECT_EQ(2, matching_regexps.size()); |
| } |
| |
| TEST(FilteredRE2Test, EmptyStringInStringSetBug) { |
| // Bug due to find() finding "" at the start of everything in a string |
| // set and thus SimplifyStringSet() would end up erasing everything. |
| // In order to test this, we have to keep PrefilterTree from discarding |
| // the OR entirely, so we have to make the minimum atom length zero. |
| |
| FilterTestVars v(0); // override the minimum atom length |
| const char* regexps[] = {"-R.+(|ADD=;AA){12}}"}; |
| const char* atoms[] = {"", "-r", "add=;aa", "}"}; |
| AddRegexpsAndCompile(regexps, arraysize(regexps), &v); |
| EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms), |
| "EmptyStringInStringSetBug", &v)); |
| } |
| |
| TEST(FilteredRE2Test, MoveSemantics) { |
| FilterTestVars v1; |
| int id; |
| v1.f.Add("foo\\d+", v1.opts, &id); |
| EXPECT_EQ(0, id); |
| v1.f.Compile(&v1.atoms); |
| EXPECT_EQ(1, v1.atoms.size()); |
| EXPECT_EQ("foo", v1.atoms[0]); |
| v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); |
| EXPECT_EQ(1, v1.matches.size()); |
| EXPECT_EQ(0, v1.matches[0]); |
| v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); |
| EXPECT_EQ(0, v1.matches.size()); |
| |
| // The moved-to object should do what the moved-from object did. |
| FilterTestVars v2; |
| v2.f = std::move(v1.f); |
| v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches); |
| EXPECT_EQ(1, v2.matches.size()); |
| EXPECT_EQ(0, v2.matches[0]); |
| v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches); |
| EXPECT_EQ(0, v2.matches.size()); |
| |
| // The moved-from object should have been reset and be reusable. |
| v1.f.Add("bar\\d+", v1.opts, &id); |
| EXPECT_EQ(0, id); |
| v1.f.Compile(&v1.atoms); |
| EXPECT_EQ(1, v1.atoms.size()); |
| EXPECT_EQ("bar", v1.atoms[0]); |
| v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); |
| EXPECT_EQ(0, v1.matches.size()); |
| v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); |
| EXPECT_EQ(1, v1.matches.size()); |
| EXPECT_EQ(0, v1.matches[0]); |
| |
| // Verify that "overwriting" works and also doesn't leak memory. |
| // (The latter will need a leak detector such as LeakSanitizer.) |
| v1.f = std::move(v2.f); |
| v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); |
| EXPECT_EQ(1, v1.matches.size()); |
| EXPECT_EQ(0, v1.matches[0]); |
| v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); |
| EXPECT_EQ(0, v1.matches.size()); |
| } |
| |
| } // namespace re2 |