gperftools/benchmark/trivialre.h

452 lines
15 KiB
C++

/* -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
* Copyright (c) 2024, gperftools Contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef TRIVIALRE_H_
#define TRIVIALRE_H_
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <functional>
#include <initializer_list>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
namespace trivialre {
// Callback for Matcher. See below.
using CB = std::function<bool(std::string_view, bool)>;
// Matcher is a function that gets string and invokes given callback
// with remaining text (i.e. suffix of parsed part) for each
// successful parsing. We're able to express arbitrary trees of regexp
// expressions with this simple abstraction.
using Matcher = std::function<bool(std::string_view str, bool line_start, const CB& cb)>;
// MatchSubstring returns true iff there is substring of `str' that
// matches given matcher.
inline bool MatchSubstring(const Matcher& m, std::string_view str) {
size_t sz = str.size();
CB succeed = [](std::string_view str, bool line_start) { return true; };
bool line_start = true;
for (size_t i = 0; i < sz; i++) {
if (m(str, line_start, succeed)) {
return true;
}
line_start = (str[0] == '\n');
str.remove_prefix(1);
}
return m("", line_start, succeed);
}
Matcher CompileREOrDie(std::string_view str);
// --- implementation ---
namespace matchers {
// MatcherBuilder is a collection of functions that combine Matchers
// according to various kinds of regex structures (sequence,
// alternatives, '*' etc).
struct MatcherBuilder {
using Matcher = trivialre::Matcher;
// Returns Matcher that parses given literal.
static Matcher Lit(std::string_view lit);
// Returns Matcher that left then right.
static Matcher Seq(Matcher left, Matcher right);
// Returns Matcher that given parses given sequence of matchers
// (folding from right for efficiency).
static Matcher SeqMany(std::initializer_list<Matcher> list);
// Returns Matcher that parses either left or right.
static Matcher Alt(Matcher left, Matcher right);
// Returns Matcher that matches 0 or more parsings of given nested
// matcher. I.e. implements '*' operator of regexps.
static Matcher Star(Matcher nested);
static Matcher LineStart();
static Matcher LineEnd();
// Returns Matcher that parses one character iff pred(character) is
// true.
template <typename Predicate>
static Matcher CharP(Predicate pred);
// Dot matcher implements '.' operator of regexps. I.e. matches
// exactly one non-newline character.
static Matcher Dot() {
return CharP([](char ch) { return ch != '\n'; });
}
// Any matcher immediately suceeds consuming no text at all.
static Matcher Any() {
return [](std::string_view str, bool line_start, const CB& cb) { return cb(str, line_start); };
}
};
inline Matcher MatcherBuilder::Lit(std::string_view lit) {
return [=](std::string_view str, bool line_start, const CB& cb) -> bool {
auto sz = lit.size();
if (str.substr(0, sz) != lit) {
return false;
}
line_start = (sz == 0) ? line_start : (str[sz - 1] == '\n');
str.remove_prefix(sz);
// printf("Matched prefix %.*s (rest: %.*s)\n",
// static_cast<int>(lit.size()), lit.data(),
// std::max<int>(str.size(), 6), str.data());
return cb(str, line_start);
};
}
inline Matcher MatcherBuilder::Seq(Matcher left, Matcher right) {
return [left = std::move(left), right = std::move(right)](std::string_view str, bool line_start, const CB& cb) -> bool {
return left(str, line_start, [=](std::string_view str, bool line_start) { return right(str, line_start, cb); });
};
}
inline Matcher MatcherBuilder::SeqMany(std::initializer_list<Matcher> list) {
if (std::empty(list)) {
return Any();
}
auto it = std::rbegin(list);
Matcher rv = *it++;
while (it != std::rend(list)) {
rv = Seq(*it++, std::move(rv));
}
return rv;
}
inline Matcher MatcherBuilder::Alt(Matcher left, Matcher right) {
return [left = std::move(left), right = std::move(right)](std::string_view str, bool line_start, const CB& cb) -> bool {
if (left(str, line_start, cb)) {
return true;
}
return right(str, line_start, cb);
};
}
inline Matcher MatcherBuilder::Star(Matcher nested) {
return [nested = std::move(nested)](std::string_view str, bool line_start, const CB& cb) -> bool {
CB rec;
rec = [&](std::string_view str, bool line_start) -> bool {
if (cb(str, line_start)) {
return true;
}
return nested(str, line_start, rec);
};
return rec(str, line_start);
};
}
template <typename Predicate>
Matcher MatcherBuilder::CharP(Predicate pred) {
return [pred = std::move(pred)](std::string_view str, bool line_start, const CB& cb) -> bool {
if (str.size() && pred(str[0])) {
bool line_start = (str[0] == '\n');
str.remove_prefix(1);
return cb(str, line_start);
}
return false;
};
}
inline Matcher MatcherBuilder::LineStart() {
return [](std::string_view str, bool line_start, const CB& cb) {
if (!line_start) return false;
return cb(str, line_start);
};
}
inline Matcher MatcherBuilder::LineEnd() {
return [](std::string_view str, bool line_start, const CB& cb) {
if (str.size() && str[0] != '\n') {
return false;
}
// Yes, line-end doesn't consume the \n character.
return cb(str, line_start);
};
}
} // namespace matchers
namespace re_compiler {
struct ErrorPolicy {
std::string_view original_str;
void NoteError(std::string_view msg, std::string_view at) {
// For our trivial implementation we're only able to crash
fprintf(stderr, "parse error %.*s, at: %.*s\n", int(msg.size()), msg.data(), int(at.size()), at.data());
fprintf(stderr, "expression we were parsing:\n%.*s\n", int(original_str.size()), original_str.data());
if (size_t diff = at.data() - original_str.data(); diff < 120) {
fprintf(stderr, "%s^\n", std::string{}.append(diff, '-').c_str());
}
fflush(stderr);
abort();
}
void StartedParsing(std::string_view str) { original_str = str; }
};
// C is our regexp compiler. It assembles matcher tree from string
// regexp representation. Given builder is used to construct concrete
// matchers, allowing flexibility (see StringTestingBuilder).
template <typename Builder, typename ErrorPolicy = re_compiler::ErrorPolicy>
struct C : public ErrorPolicy {
using Matcher = typename Builder::Matcher;
// ParseResult is Matcher (or nothing if we parsed empty string) and
// remaining text.
using ParseResult = std::pair<std::optional<Matcher>, std::string_view>;
const Builder& builder;
explicit C(const Builder& builder) : builder(builder) {}
bool IsCharAt(std::string_view str, size_t index, char ch) { return index < str.size() && str[index] == ch; }
// This is top level parser. It parses alternatives of regex runs.
ParseResult ParseAlt(std::string_view str) {
auto [maybe_left, str_l] = ParseRun(str);
if (IsCharAt(str_l, 0, '|')) {
if (!maybe_left) {
maybe_left.emplace(builder.Any());
}
auto [maybe_right, str_r] = ParseAlt(str_l.substr(1));
if (!maybe_right) {
maybe_right.emplace(builder.Any());
}
return {builder.Alt(std::move(maybe_left.value()), std::move(maybe_right.value())), str_r};
}
return {std::move(maybe_left), str_l};
}
using FnPred = std::function<bool(char)>;
template <typename Body>
void AddPred(FnPred* pred, Body body) {
if (!*pred) {
*pred = body;
} else {
*pred = [old = std::move(*pred), body = std::move(body)](char ch) { return old(ch) || body(ch); };
}
}
// Parses [<set-of-chars>] expression. Note: str is just past
// opening '[' character)
ParseResult CompileCharSet(std::string_view str) {
bool negated = false;
if (IsCharAt(str, 0, '^')) {
negated = true;
str.remove_prefix(1);
}
FnPred pred;
while (str.size() > 0 && str[0] != ']') {
if (str.size() > 2 && str[1] == '-' && str[2] != ']') {
// range
AddPred(&pred, [a = str[0], b = str[2]](char ch) { return a <= ch && ch <= b; });
str.remove_prefix(3);
continue;
}
char ch = str[0];
if (ch == '\\') {
if (str.size() == 1) {
break;
}
str.remove_prefix(1);
ch = str[0];
}
AddPred(&pred, [ch](char candidate) { return ch == candidate; });
str.remove_prefix(1);
}
if (!IsCharAt(str, 0, ']')) {
ErrorPolicy::NoteError("failed to spot ] at the end of char-set term", str);
return {{}, ""};
}
if (!pred) {
pred = [negated](char candidate) { return negated; };
} else if (negated) {
pred = [pred = std::move(pred)](char candidate) { return !pred(candidate); };
}
return {builder.CharP(std::move(pred)), str.substr(1)};
}
// Parses sequence of literals and groups and groups of '*' and '+'
// expressions.
ParseResult ParseRun(std::string_view str) {
if (str.size() == 0) {
return {{}, str};
}
static constexpr char kSpecials[] = "()[]{}.*|\\?+^$";
static constexpr const char* kSpecialsEnd = kSpecials + sizeof(kSpecials) - 1;
size_t i;
for (i = 0; i < str.size(); i++) {
char ch = str[i];
if (std::find(kSpecials, kSpecialsEnd, ch) != kSpecialsEnd) {
break;
}
}
if (i) {
// we got literal
if (i > 1 && (IsCharAt(str, i, '*') || IsCharAt(str, i, '+') || IsCharAt(str, i, '?'))) {
// only last char of literal char runs will be '*'-ed. So lets
// be careful
i--;
}
// we got literal. Lets try to concat it with possible '*' and next run
return MaybeStar(builder.Lit(str.substr(0, i)), str.substr(i));
}
char first = str[0];
if (first == '\\' && str.size() > 1) {
std::string_view literal;
if (str[1] == 'n') {
literal = "\n";
} else if (str[1] == 't') {
literal = "\t";
} else if (str[1] == ' ') {
literal = " ";
} else if (auto place = std::find(kSpecials, kSpecialsEnd, str[1]); place != kSpecialsEnd) {
literal = {place, 1};
} else {
// Failure to parse
return {{}, str};
}
return MaybeStar(builder.Lit(literal), str.substr(2));
}
if (first == '^') {
return MaybeStar(builder.LineStart(), str.substr(1));
}
if (first == '$') {
return MaybeStar(builder.LineEnd(), str.substr(1));
}
if (first == '.') {
return MaybeStar(builder.Dot(), str.substr(1));
}
if (first == '[') {
return CompileCharSet(str.substr(1));
}
if (first == '(') {
auto [maybe_nested, new_str] = ParseAlt(str.substr(1));
if (!IsCharAt(new_str, 0, ')')) {
ErrorPolicy::NoteError("failed to spot ) at the end of group term", new_str);
return {{}, ""};
}
if (maybe_nested) {
return MaybeStar(std::move(maybe_nested.value()), new_str.substr(1));
}
// empty group. We just ignore it. But lets also handle possible
// '*' after it (which we also eat)
if (IsCharAt(new_str, 1, '*')) {
new_str.remove_prefix(1);
}
return ParseRun(new_str.substr(1));
}
// Likely '|', ')' or parse error
return {{}, str};
}
// Sequences left then right or just left if right is missing).
Matcher MaybeSeq(Matcher left, std::optional<Matcher> right) {
if (right) {
return builder.Seq(std::move(left), std::move(right.value()));
}
return left;
}
// Builds matcher for '+' expression.
Matcher MakePlus(Matcher nested) { return builder.Seq(nested, builder.Star(nested)); }
// Given regex matcher, check if it is followed by '*' or '+' and
// wrap it if needed, then continue gathering sequence of matches
// (see ParseRun)
ParseResult MaybeStar(Matcher left, std::string_view str) {
if (IsCharAt(str, 0, '*')) {
left = builder.Star(std::move(left));
str.remove_prefix(1);
if (IsCharAt(str, 0, '?')) {
// We don't produce actual matching, so there is not
// difference between lazy and eager matching. But lets
// support the syntax anyways, by ignoring lazyness marker
str.remove_prefix(1);
}
}
if (IsCharAt(str, 0, '+')) {
left = MakePlus(std::move(left));
str.remove_prefix(1);
if (IsCharAt(str, 0, '?')) {
// We don't produce actual matching, so there is not
// difference between lazy and eager matching. But lets
// support the syntax anyways, by ignoring lazyness marker
str.remove_prefix(1);
}
}
if (IsCharAt(str, 0, '?')) {
left = builder.Alt(builder.Any(), std::move(left));
str.remove_prefix(1);
}
auto [maybe_right, new_str] = ParseRun(str);
return {MaybeSeq(left, std::move(maybe_right)), new_str};
}
Matcher CompileOrDie(std::string_view str) {
ErrorPolicy::StartedParsing(str);
auto [maybe_m, new_str] = ParseAlt(str);
if (!new_str.empty()) {
ErrorPolicy::NoteError("failed to parse entire re string", new_str);
}
if (!maybe_m) {
return builder.Any();
}
return maybe_m.value();
}
};
} // namespace re_compiler
inline Matcher CompileREOrDie(std::string_view str) { return re_compiler::C<matchers::MatcherBuilder>({}).CompileOrDie(str); }
} // namespace trivialre
#endif // TRIVIALRE_H_