/* -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
 * Copyright (c) 2024, gperftools Contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef TRIVIALRE_H_
#define TRIVIALRE_H_

// NOTE(review): the include targets below were reconstructed from the
// names this header uses; confirm them against the upstream file.
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <functional>
#include <initializer_list>
#include <iterator>
#include <optional>
#include <string>
#include <string_view>
#include <utility>

namespace trivialre {

// Callback for Matcher. It receives the text that remains after a
// successful parse and whether that remaining text begins a line. It
// returns true to accept the parse.
using CB = std::function<bool(std::string_view, bool)>;

// Matcher is a function that gets a string (plus a flag telling
// whether the string starts a line) and invokes the given callback
// with the remaining text (i.e. the suffix after the parsed part) for
// each successful parsing. It returns true as soon as some callback
// invocation returns true. We're able to express arbitrary trees of
// regexp expressions with this simple abstraction.
using Matcher = std::function<bool(std::string_view, bool, const CB&)>;

// MatchSubstring returns true iff there is a substring of `str' that
// matches the given matcher.
//
// The matcher is tried at every offset of `str', including the empty
// suffix at the very end (so that patterns able to match empty text
// still get a chance on an empty input).
inline bool MatchSubstring(const Matcher& m, std::string_view str) {
  const CB succeed = [](std::string_view, bool) { return true; };

  // The very first position is, by definition, a line start.
  bool line_start = true;
  for (;;) {
    if (m(str, line_start, succeed)) {
      return true;
    }
    if (str.empty()) {
      // The empty suffix was the last candidate. We must not touch
      // str[0] or remove_prefix() here: both are undefined on an
      // empty view.
      return false;
    }
    line_start = (str.front() == '\n');
    str.remove_prefix(1);
  }
}

// Compiles the given regexp into a Matcher. Parse errors are reported
// on stderr and abort the process.
inline Matcher CompileREOrDie(std::string_view str);

// --- implementation ---

namespace matchers {

// MatcherBuilder is a collection of functions that combine Matchers
// according to various kinds of regex structures (sequence,
// alternatives, '*' etc).
struct MatcherBuilder {
  using Matcher = trivialre::Matcher;

  // Returns Matcher that parses the given literal. The literal text
  // is copied into the matcher.
  static Matcher Lit(std::string_view lit);

  // Returns Matcher that parses left and then right.
  static Matcher Seq(Matcher left, Matcher right);

  // Returns Matcher that parses the given sequence of matchers
  // (folding from the right for efficiency).
  static Matcher SeqMany(std::initializer_list<Matcher> list);

  // Returns Matcher that parses either left or right.
  static Matcher Alt(Matcher left, Matcher right);

  // Returns Matcher that matches 0 or more parsings of the given
  // nested matcher. I.e. implements the '*' operator of regexps.
  static Matcher Star(Matcher nested);

  // Returns Matcher that succeeds, consuming nothing, only when the
  // current position is a line start.
  static Matcher LineStart();

  // Returns Matcher that succeeds, consuming nothing, only at end of
  // text or right before a '\n'.
  static Matcher LineEnd();

  // Returns Matcher that parses one character iff pred(character) is
  // true.
  template <typename Predicate>
  static Matcher CharP(Predicate pred);

  // Dot matcher implements the '.' operator of regexps. I.e. matches
  // exactly one non-newline character.
  static Matcher Dot() {
    return CharP([](char ch) { return ch != '\n'; });
  }

  // Any matcher immediately succeeds, consuming no text at all.
  static Matcher Any() {
    return [](std::string_view str, bool line_start, const CB& cb) -> bool {
      return cb(str, line_start);
    };
  }
};

inline Matcher MatcherBuilder::Lit(std::string_view lit) {
  // Own the literal: the returned Matcher routinely outlives the
  // pattern string it was compiled from, so holding a view into the
  // pattern would dangle.
  return [lit = std::string(lit)](std::string_view str, bool line_start,
                                  const CB& cb) -> bool {
    const size_t sz = lit.size();
    // substr() clamps the length, so a too-short `str' simply fails
    // the comparison.
    if (str.substr(0, sz) != std::string_view(lit)) {
      return false;
    }
    if (sz != 0) {
      // An empty literal leaves the line-start state untouched.
      line_start = (str[sz - 1] == '\n');
    }
    str.remove_prefix(sz);
    return cb(str, line_start);
  };
}

inline Matcher MatcherBuilder::Seq(Matcher left, Matcher right) {
  return [left = std::move(left), right = std::move(right)](
             std::string_view str, bool line_start, const CB& cb) -> bool {
    // The continuation is only ever invoked while left(...) is
    // running, so referring to `right' and `cb' by reference is safe
    // and avoids copying two std::function objects per attempt.
    return left(str, line_start,
                [&right, &cb](std::string_view rest, bool rest_line_start) -> bool {
                  return right(rest, rest_line_start, cb);
                });
  };
}

inline Matcher MatcherBuilder::SeqMany(std::initializer_list<Matcher> list) {
  if (std::empty(list)) {
    return Any();
  }
  // Fold from the right: ((a, (b, (c, d)))).
  auto it = std::rbegin(list);
  Matcher rv = *it++;
  while (it != std::rend(list)) {
    rv = Seq(*it++, std::move(rv));
  }
  return rv;
}

inline Matcher MatcherBuilder::Alt(Matcher left, Matcher right) {
  return [left = std::move(left), right = std::move(right)](
             std::string_view str, bool line_start, const CB& cb) -> bool {
    // Left alternative gets the first try.
    return left(str, line_start, cb) || right(str, line_start, cb);
  };
}

inline Matcher MatcherBuilder::Star(Matcher nested) {
  return [nested = std::move(nested)](std::string_view str, bool line_start,
                                      const CB& cb) -> bool {
    CB rec;
    rec = [&](std::string_view rest, bool rest_line_start) -> bool {
      // First try to stop repeating right here...
      if (cb(rest, rest_line_start)) {
        return true;
      }
      // ...otherwise try one more repetition. A repetition that
      // consumed nothing cannot lead to a parse we have not already
      // tried above, and following it would recurse forever (think
      // "(a*)*" or "^*"), so we insist on progress.
      const size_t before = rest.size();
      const CB progressed = [&rec, before](std::string_view after,
                                           bool after_line_start) -> bool {
        return after.size() < before && rec(after, after_line_start);
      };
      return nested(rest, rest_line_start, progressed);
    };
    return rec(str, line_start);
  };
}

template <typename Predicate>
Matcher MatcherBuilder::CharP(Predicate pred) {
  return [pred = std::move(pred)](std::string_view str, bool /*line_start*/,
                                  const CB& cb) -> bool {
    if (str.empty() || !pred(str[0])) {
      return false;
    }
    // Consuming a newline puts the remaining text at a line start.
    const bool next_line_start = (str[0] == '\n');
    str.remove_prefix(1);
    return cb(str, next_line_start);
  };
}

inline Matcher MatcherBuilder::LineStart() {
  return [](std::string_view str, bool line_start, const CB& cb) -> bool {
    if (!line_start) {
      return false;
    }
    return cb(str, line_start);
  };
}

inline Matcher MatcherBuilder::LineEnd() {
  return [](std::string_view str, bool line_start, const CB& cb) -> bool {
    if (!str.empty() && str[0] != '\n') {
      return false;
    }
    // Yes, line-end doesn't consume the \n character.
    return cb(str, line_start);
  };
}

}  // namespace matchers

namespace re_compiler {

// Error reporting used by the compiler below. It remembers the
// expression being parsed so that errors can show it.
struct ErrorPolicy {
  std::string_view original_str;

  // Prints `msg', the unparsed remainder `at' and the whole
  // expression to stderr, then aborts.
  void NoteError(std::string_view msg, std::string_view at) {
    // For our trivial implementation we're only able to crash
    fprintf(stderr, "parse error %.*s, at: %.*s\n",
            static_cast<int>(msg.size()), msg.data(),
            static_cast<int>(at.size()), at.data());
    fprintf(stderr, "expression we were parsing:\n%.*s\n",
            static_cast<int>(original_str.size()), original_str.data());
    // Draw an arrow under the offending position, but only when it
    // is within the first 120 characters of the expression.
    if (size_t diff = at.data() - original_str.data(); diff < 120) {
      fprintf(stderr, "%s^\n", std::string{}.append(diff, '-').c_str());
    }
    fflush(stderr);
    abort();
  }

  void StartedParsing(std::string_view str) { original_str = str; }
};

// C is our regexp compiler. It assembles a matcher tree from the
// string regexp representation. The given builder is used to
// construct concrete matchers, allowing flexibility (the original
// comment points at a StringTestingBuilder, which is not part of this
// header).
template <typename Builder>
struct C : public ErrorPolicy {
  using Matcher = typename Builder::Matcher;
  // ParseResult is Matcher (or nothing if we parsed empty string) and
  // remaining text.
  using ParseResult = std::pair<std::optional<Matcher>, std::string_view>;

  const Builder& builder;

  explicit C(const Builder& builder) : builder(builder) {}

  // Bounds-checked test for str[index] == ch.
  bool IsCharAt(std::string_view str, size_t index, char ch) {
    return index < str.size() && str[index] == ch;
  }

  // This is the top level parser. It parses alternatives of regex
  // runs. A missing side of '|' is treated as the always-succeeding
  // Any matcher.
  ParseResult ParseAlt(std::string_view str) {
    auto [maybe_left, str_l] = ParseRun(str);
    if (!IsCharAt(str_l, 0, '|')) {
      return {std::move(maybe_left), str_l};
    }

    if (!maybe_left) {
      maybe_left.emplace(builder.Any());
    }
    auto [maybe_right, str_r] = ParseAlt(str_l.substr(1));
    if (!maybe_right) {
      maybe_right.emplace(builder.Any());
    }
    return {builder.Alt(std::move(maybe_left.value()),
                        std::move(maybe_right.value())),
            str_r};
  }

  using FnPred = std::function<bool(char)>;

  // ORs `body' into the accumulated predicate `*pred' (or installs it
  // if there is no predicate yet).
  template <typename Body>
  void AddPred(FnPred* pred, Body body) {
    if (!*pred) {
      *pred = std::move(body);
      return;
    }
    *pred = [old = std::move(*pred), body = std::move(body)](char ch) {
      return old(ch) || body(ch);
    };
  }

  // Parses a [] expression. Note: str is just past the opening '['
  // character. Like every other term, the resulting matcher goes
  // through MaybeStar so that it can be quantified and followed by
  // the rest of the run.
  //
  // Note that the range check runs before escape handling, so a
  // backslash directly followed by '-' is read as the start of a
  // range.
  ParseResult CompileCharSet(std::string_view str) {
    bool negated = false;
    if (IsCharAt(str, 0, '^')) {
      negated = true;
      str.remove_prefix(1);
    }

    FnPred pred;
    while (!str.empty() && str[0] != ']') {
      if (str.size() > 2 && str[1] == '-' && str[2] != ']') {
        // range
        AddPred(&pred, [a = str[0], b = str[2]](char ch) {
          return a <= ch && ch <= b;
        });
        str.remove_prefix(3);
        continue;
      }

      char ch = str[0];
      if (ch == '\\') {
        if (str.size() == 1) {
          // Dangling backslash: fall through to the missing-']' error.
          break;
        }
        str.remove_prefix(1);
        ch = str[0];
      }
      AddPred(&pred, [ch](char candidate) { return ch == candidate; });
      str.remove_prefix(1);
    }

    if (!IsCharAt(str, 0, ']')) {
      ErrorPolicy::NoteError("failed to spot ] at the end of char-set term", str);
      return {{}, ""};
    }

    if (!pred) {
      // "[]" matches nothing, "[^]" matches any character.
      pred = [negated](char) { return negated; };
    } else if (negated) {
      pred = [pred = std::move(pred)](char candidate) {
        return !pred(candidate);
      };
    }

    return MaybeStar(builder.CharP(std::move(pred)), str.substr(1));
  }

  // Parses a sequence of literals, groups and '*'/'+'/'?'
  // expressions. Returns nothing (and leaves the text untouched) when
  // it runs into something it cannot start a term with: '|', ')', an
  // unknown escape, etc.
  ParseResult ParseRun(std::string_view str) {
    if (str.empty()) {
      return {{}, str};
    }

    static constexpr char kSpecials[] = "()[]{}.*|\\?+^$";
    static constexpr const char* kSpecialsEnd =
        kSpecials + sizeof(kSpecials) - 1;

    // Find the length of the leading run of ordinary characters.
    size_t i;
    for (i = 0; i < str.size(); i++) {
      if (std::find(kSpecials, kSpecialsEnd, str[i]) != kSpecialsEnd) {
        break;
      }
    }

    if (i) {
      // we got literal
      if (i > 1 && (IsCharAt(str, i, '*') || IsCharAt(str, i, '+') ||
                    IsCharAt(str, i, '?'))) {
        // only the last char of a literal run gets quantified, so
        // leave it out of this literal; it is picked up on its own
        // by the next ParseRun.
        i--;
      }
      // Lets try to concat the literal with possible '*' and next run
      return MaybeStar(builder.Lit(str.substr(0, i)), str.substr(i));
    }

    const char first = str[0];

    if (first == '\\' && str.size() > 1) {
      std::string_view literal;
      if (str[1] == 'n') {
        literal = "\n";
      } else if (str[1] == 't') {
        literal = "\t";
      } else if (str[1] == ' ') {
        literal = " ";
      } else if (auto place = std::find(kSpecials, kSpecialsEnd, str[1]);
                 place != kSpecialsEnd) {
        // Escaped special character stands for itself.
        literal = std::string_view(place, 1);
      } else {
        // Failure to parse
        return {{}, str};
      }
      return MaybeStar(builder.Lit(literal), str.substr(2));
    }

    if (first == '^') {
      return MaybeStar(builder.LineStart(), str.substr(1));
    }
    if (first == '$') {
      return MaybeStar(builder.LineEnd(), str.substr(1));
    }
    if (first == '.') {
      return MaybeStar(builder.Dot(), str.substr(1));
    }
    if (first == '[') {
      return CompileCharSet(str.substr(1));
    }

    if (first == '(') {
      auto [maybe_nested, new_str] = ParseAlt(str.substr(1));
      if (!IsCharAt(new_str, 0, ')')) {
        ErrorPolicy::NoteError("failed to spot ) at the end of group term",
                               new_str);
        return {{}, ""};
      }
      if (maybe_nested) {
        return MaybeStar(std::move(maybe_nested.value()), new_str.substr(1));
      }
      // empty group. We just ignore it. But lets also handle possible
      // '*' after it (which we also eat)
      if (IsCharAt(new_str, 1, '*')) {
        new_str.remove_prefix(1);
      }
      return ParseRun(new_str.substr(1));
    }

    // Likely '|', ')' or parse error
    return {{}, str};
  }

  // Sequences left then right (or just left if right is missing).
  Matcher MaybeSeq(Matcher left, std::optional<Matcher> right) {
    if (right) {
      return builder.Seq(std::move(left), std::move(right.value()));
    }
    return left;
  }

  // Builds matcher for '+' expression: one occurrence followed by
  // zero or more.
  Matcher MakePlus(Matcher nested) {
    // Build the star part first so that `nested' can then be moved
    // without relying on argument evaluation order.
    Matcher star = builder.Star(nested);
    return builder.Seq(std::move(nested), std::move(star));
  }

  // Given a regex matcher, check if it is followed by '*', '+' or '?'
  // and wrap it if needed, then continue gathering the sequence of
  // matchers (see ParseRun).
  ParseResult MaybeStar(Matcher left, std::string_view str) {
    if (IsCharAt(str, 0, '*')) {
      left = builder.Star(std::move(left));
      str.remove_prefix(1);
      if (IsCharAt(str, 0, '?')) {
        // We don't produce actual matching, so there is no
        // difference between lazy and eager matching. But lets
        // support the syntax anyways, by ignoring the laziness marker
        str.remove_prefix(1);
      }
    }
    if (IsCharAt(str, 0, '+')) {
      left = MakePlus(std::move(left));
      str.remove_prefix(1);
      if (IsCharAt(str, 0, '?')) {
        // Same as above: the laziness marker is accepted and ignored.
        str.remove_prefix(1);
      }
    }
    if (IsCharAt(str, 0, '?')) {
      // Optional term: either nothing or the term itself.
      left = builder.Alt(builder.Any(), std::move(left));
      str.remove_prefix(1);
    }

    auto [maybe_right, new_str] = ParseRun(str);
    return {MaybeSeq(std::move(left), std::move(maybe_right)), new_str};
  }

  // Compiles the whole expression. Aborts (via NoteError) if some
  // text is left unparsed. An empty expression compiles to Any.
  Matcher CompileOrDie(std::string_view str) {
    ErrorPolicy::StartedParsing(str);
    auto [maybe_m, new_str] = ParseAlt(str);
    if (!new_str.empty()) {
      ErrorPolicy::NoteError("failed to parse entire re string", new_str);
    }
    if (!maybe_m) {
      return builder.Any();
    }
    return std::move(maybe_m.value());
  }
};

}  // namespace re_compiler

inline Matcher CompileREOrDie(std::string_view str) {
  const matchers::MatcherBuilder builder{};
  return re_compiler::C<matchers::MatcherBuilder>(builder).CompileOrDie(str);
}

}  // namespace trivialre

#endif  // TRIVIALRE_H_