452 lines
15 KiB
C++
452 lines
15 KiB
C++
/* -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
|
|
* Copyright (c) 2024, gperftools Contributors
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are
|
|
* met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above
|
|
* copyright notice, this list of conditions and the following disclaimer
|
|
* in the documentation and/or other materials provided with the
|
|
* distribution.
|
|
* * Neither the name of Google Inc. nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
#ifndef TRIVIALRE_H_
|
|
#define TRIVIALRE_H_
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#include <algorithm>
|
|
#include <functional>
|
|
#include <initializer_list>
|
|
#include <optional>
|
|
#include <string>
|
|
#include <string_view>
|
|
#include <utility>
|
|
|
|
namespace trivialre {
|
|
|
|
// Callback for Matcher. See below.
|
|
using CB = std::function<bool(std::string_view, bool)>;
|
|
// Matcher is a function that gets string and invokes given callback
|
|
// with remaining text (i.e. suffix of parsed part) for each
|
|
// successful parsing. We're able to express arbitrary trees of regexp
|
|
// expressions with this simple abstraction.
|
|
using Matcher = std::function<bool(std::string_view str, bool line_start, const CB& cb)>;
|
|
|
|
// MatchSubstring returns true iff there is substring of `str' that
|
|
// matches given matcher.
|
|
inline bool MatchSubstring(const Matcher& m, std::string_view str) {
|
|
size_t sz = str.size();
|
|
CB succeed = [](std::string_view str, bool line_start) { return true; };
|
|
bool line_start = true;
|
|
for (size_t i = 0; i < sz; i++) {
|
|
if (m(str, line_start, succeed)) {
|
|
return true;
|
|
}
|
|
line_start = (str[0] == '\n');
|
|
str.remove_prefix(1);
|
|
}
|
|
return m("", line_start, succeed);
|
|
}
|
|
|
|
Matcher CompileREOrDie(std::string_view str);
|
|
|
|
// --- implementation ---
|
|
|
|
namespace matchers {
|
|
|
|
// MatcherBuilder is a collection of functions that combine Matchers
|
|
// according to various kinds of regex structures (sequence,
|
|
// alternatives, '*' etc).
|
|
struct MatcherBuilder {
|
|
using Matcher = trivialre::Matcher;
|
|
|
|
// Returns Matcher that parses given literal.
|
|
static Matcher Lit(std::string_view lit);
|
|
// Returns Matcher that left then right.
|
|
static Matcher Seq(Matcher left, Matcher right);
|
|
// Returns Matcher that given parses given sequence of matchers
|
|
// (folding from right for efficiency).
|
|
static Matcher SeqMany(std::initializer_list<Matcher> list);
|
|
// Returns Matcher that parses either left or right.
|
|
static Matcher Alt(Matcher left, Matcher right);
|
|
// Returns Matcher that matches 0 or more parsings of given nested
|
|
// matcher. I.e. implements '*' operator of regexps.
|
|
static Matcher Star(Matcher nested);
|
|
static Matcher LineStart();
|
|
static Matcher LineEnd();
|
|
|
|
// Returns Matcher that parses one character iff pred(character) is
|
|
// true.
|
|
template <typename Predicate>
|
|
static Matcher CharP(Predicate pred);
|
|
|
|
// Dot matcher implements '.' operator of regexps. I.e. matches
|
|
// exactly one non-newline character.
|
|
static Matcher Dot() {
|
|
return CharP([](char ch) { return ch != '\n'; });
|
|
}
|
|
// Any matcher immediately suceeds consuming no text at all.
|
|
static Matcher Any() {
|
|
return [](std::string_view str, bool line_start, const CB& cb) { return cb(str, line_start); };
|
|
}
|
|
};
|
|
|
|
inline Matcher MatcherBuilder::Lit(std::string_view lit) {
|
|
return [=](std::string_view str, bool line_start, const CB& cb) -> bool {
|
|
auto sz = lit.size();
|
|
if (str.substr(0, sz) != lit) {
|
|
return false;
|
|
}
|
|
line_start = (sz == 0) ? line_start : (str[sz - 1] == '\n');
|
|
str.remove_prefix(sz);
|
|
// printf("Matched prefix %.*s (rest: %.*s)\n",
|
|
// static_cast<int>(lit.size()), lit.data(),
|
|
// std::max<int>(str.size(), 6), str.data());
|
|
return cb(str, line_start);
|
|
};
|
|
}
|
|
|
|
inline Matcher MatcherBuilder::Seq(Matcher left, Matcher right) {
|
|
return [left = std::move(left), right = std::move(right)](std::string_view str, bool line_start, const CB& cb) -> bool {
|
|
return left(str, line_start, [=](std::string_view str, bool line_start) { return right(str, line_start, cb); });
|
|
};
|
|
}
|
|
|
|
inline Matcher MatcherBuilder::SeqMany(std::initializer_list<Matcher> list) {
|
|
if (std::empty(list)) {
|
|
return Any();
|
|
}
|
|
auto it = std::rbegin(list);
|
|
Matcher rv = *it++;
|
|
while (it != std::rend(list)) {
|
|
rv = Seq(*it++, std::move(rv));
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
inline Matcher MatcherBuilder::Alt(Matcher left, Matcher right) {
|
|
return [left = std::move(left), right = std::move(right)](std::string_view str, bool line_start, const CB& cb) -> bool {
|
|
if (left(str, line_start, cb)) {
|
|
return true;
|
|
}
|
|
return right(str, line_start, cb);
|
|
};
|
|
}
|
|
|
|
inline Matcher MatcherBuilder::Star(Matcher nested) {
|
|
return [nested = std::move(nested)](std::string_view str, bool line_start, const CB& cb) -> bool {
|
|
CB rec;
|
|
rec = [&](std::string_view str, bool line_start) -> bool {
|
|
if (cb(str, line_start)) {
|
|
return true;
|
|
}
|
|
return nested(str, line_start, rec);
|
|
};
|
|
return rec(str, line_start);
|
|
};
|
|
}
|
|
|
|
template <typename Predicate>
|
|
Matcher MatcherBuilder::CharP(Predicate pred) {
|
|
return [pred = std::move(pred)](std::string_view str, bool line_start, const CB& cb) -> bool {
|
|
if (str.size() && pred(str[0])) {
|
|
bool line_start = (str[0] == '\n');
|
|
str.remove_prefix(1);
|
|
return cb(str, line_start);
|
|
}
|
|
return false;
|
|
};
|
|
}
|
|
|
|
inline Matcher MatcherBuilder::LineStart() {
|
|
return [](std::string_view str, bool line_start, const CB& cb) {
|
|
if (!line_start) return false;
|
|
return cb(str, line_start);
|
|
};
|
|
}
|
|
|
|
inline Matcher MatcherBuilder::LineEnd() {
|
|
return [](std::string_view str, bool line_start, const CB& cb) {
|
|
if (str.size() && str[0] != '\n') {
|
|
return false;
|
|
}
|
|
// Yes, line-end doesn't consume the \n character.
|
|
return cb(str, line_start);
|
|
};
|
|
}
|
|
|
|
} // namespace matchers
|
|
|
|
namespace re_compiler {
|
|
|
|
struct ErrorPolicy {
|
|
std::string_view original_str;
|
|
|
|
void NoteError(std::string_view msg, std::string_view at) {
|
|
// For our trivial implementation we're only able to crash
|
|
fprintf(stderr, "parse error %.*s, at: %.*s\n", int(msg.size()), msg.data(), int(at.size()), at.data());
|
|
fprintf(stderr, "expression we were parsing:\n%.*s\n", int(original_str.size()), original_str.data());
|
|
if (size_t diff = at.data() - original_str.data(); diff < 120) {
|
|
fprintf(stderr, "%s^\n", std::string{}.append(diff, '-').c_str());
|
|
}
|
|
fflush(stderr);
|
|
abort();
|
|
}
|
|
|
|
void StartedParsing(std::string_view str) { original_str = str; }
|
|
};
|
|
|
|
// C is our regexp compiler. It assembles matcher tree from string
|
|
// regexp representation. Given builder is used to construct concrete
|
|
// matchers, allowing flexibility (see StringTestingBuilder).
|
|
template <typename Builder, typename ErrorPolicy = re_compiler::ErrorPolicy>
|
|
struct C : public ErrorPolicy {
|
|
using Matcher = typename Builder::Matcher;
|
|
// ParseResult is Matcher (or nothing if we parsed empty string) and
|
|
// remaining text.
|
|
using ParseResult = std::pair<std::optional<Matcher>, std::string_view>;
|
|
|
|
const Builder& builder;
|
|
explicit C(const Builder& builder) : builder(builder) {}
|
|
|
|
bool IsCharAt(std::string_view str, size_t index, char ch) { return index < str.size() && str[index] == ch; }
|
|
|
|
// This is top level parser. It parses alternatives of regex runs.
|
|
ParseResult ParseAlt(std::string_view str) {
|
|
auto [maybe_left, str_l] = ParseRun(str);
|
|
if (IsCharAt(str_l, 0, '|')) {
|
|
if (!maybe_left) {
|
|
maybe_left.emplace(builder.Any());
|
|
}
|
|
auto [maybe_right, str_r] = ParseAlt(str_l.substr(1));
|
|
if (!maybe_right) {
|
|
maybe_right.emplace(builder.Any());
|
|
}
|
|
return {builder.Alt(std::move(maybe_left.value()), std::move(maybe_right.value())), str_r};
|
|
}
|
|
return {std::move(maybe_left), str_l};
|
|
}
|
|
|
|
using FnPred = std::function<bool(char)>;
|
|
template <typename Body>
|
|
void AddPred(FnPred* pred, Body body) {
|
|
if (!*pred) {
|
|
*pred = body;
|
|
} else {
|
|
*pred = [old = std::move(*pred), body = std::move(body)](char ch) { return old(ch) || body(ch); };
|
|
}
|
|
}
|
|
|
|
// Parses [<set-of-chars>] expression. Note: str is just past
|
|
// opening '[' character)
|
|
ParseResult CompileCharSet(std::string_view str) {
|
|
bool negated = false;
|
|
if (IsCharAt(str, 0, '^')) {
|
|
negated = true;
|
|
str.remove_prefix(1);
|
|
}
|
|
FnPred pred;
|
|
|
|
while (str.size() > 0 && str[0] != ']') {
|
|
if (str.size() > 2 && str[1] == '-' && str[2] != ']') {
|
|
// range
|
|
AddPred(&pred, [a = str[0], b = str[2]](char ch) { return a <= ch && ch <= b; });
|
|
str.remove_prefix(3);
|
|
continue;
|
|
}
|
|
|
|
char ch = str[0];
|
|
|
|
if (ch == '\\') {
|
|
if (str.size() == 1) {
|
|
break;
|
|
}
|
|
str.remove_prefix(1);
|
|
ch = str[0];
|
|
}
|
|
|
|
AddPred(&pred, [ch](char candidate) { return ch == candidate; });
|
|
|
|
str.remove_prefix(1);
|
|
}
|
|
|
|
if (!IsCharAt(str, 0, ']')) {
|
|
ErrorPolicy::NoteError("failed to spot ] at the end of char-set term", str);
|
|
return {{}, ""};
|
|
}
|
|
|
|
if (!pred) {
|
|
pred = [negated](char candidate) { return negated; };
|
|
} else if (negated) {
|
|
pred = [pred = std::move(pred)](char candidate) { return !pred(candidate); };
|
|
}
|
|
return {builder.CharP(std::move(pred)), str.substr(1)};
|
|
}
|
|
|
|
// Parses sequence of literals and groups and groups of '*' and '+'
|
|
// expressions.
|
|
ParseResult ParseRun(std::string_view str) {
|
|
if (str.size() == 0) {
|
|
return {{}, str};
|
|
}
|
|
|
|
static constexpr char kSpecials[] = "()[]{}.*|\\?+^$";
|
|
static constexpr const char* kSpecialsEnd = kSpecials + sizeof(kSpecials) - 1;
|
|
|
|
size_t i;
|
|
for (i = 0; i < str.size(); i++) {
|
|
char ch = str[i];
|
|
if (std::find(kSpecials, kSpecialsEnd, ch) != kSpecialsEnd) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (i) {
|
|
// we got literal
|
|
if (i > 1 && (IsCharAt(str, i, '*') || IsCharAt(str, i, '+') || IsCharAt(str, i, '?'))) {
|
|
// only last char of literal char runs will be '*'-ed. So lets
|
|
// be careful
|
|
i--;
|
|
}
|
|
// we got literal. Lets try to concat it with possible '*' and next run
|
|
return MaybeStar(builder.Lit(str.substr(0, i)), str.substr(i));
|
|
}
|
|
|
|
char first = str[0];
|
|
if (first == '\\' && str.size() > 1) {
|
|
std::string_view literal;
|
|
if (str[1] == 'n') {
|
|
literal = "\n";
|
|
} else if (str[1] == 't') {
|
|
literal = "\t";
|
|
} else if (str[1] == ' ') {
|
|
literal = " ";
|
|
} else if (auto place = std::find(kSpecials, kSpecialsEnd, str[1]); place != kSpecialsEnd) {
|
|
literal = {place, 1};
|
|
} else {
|
|
// Failure to parse
|
|
return {{}, str};
|
|
}
|
|
return MaybeStar(builder.Lit(literal), str.substr(2));
|
|
}
|
|
if (first == '^') {
|
|
return MaybeStar(builder.LineStart(), str.substr(1));
|
|
}
|
|
if (first == '$') {
|
|
return MaybeStar(builder.LineEnd(), str.substr(1));
|
|
}
|
|
if (first == '.') {
|
|
return MaybeStar(builder.Dot(), str.substr(1));
|
|
}
|
|
if (first == '[') {
|
|
return CompileCharSet(str.substr(1));
|
|
}
|
|
|
|
if (first == '(') {
|
|
auto [maybe_nested, new_str] = ParseAlt(str.substr(1));
|
|
|
|
if (!IsCharAt(new_str, 0, ')')) {
|
|
ErrorPolicy::NoteError("failed to spot ) at the end of group term", new_str);
|
|
return {{}, ""};
|
|
}
|
|
|
|
if (maybe_nested) {
|
|
return MaybeStar(std::move(maybe_nested.value()), new_str.substr(1));
|
|
}
|
|
|
|
// empty group. We just ignore it. But lets also handle possible
|
|
// '*' after it (which we also eat)
|
|
if (IsCharAt(new_str, 1, '*')) {
|
|
new_str.remove_prefix(1);
|
|
}
|
|
return ParseRun(new_str.substr(1));
|
|
}
|
|
|
|
// Likely '|', ')' or parse error
|
|
return {{}, str};
|
|
}
|
|
|
|
// Sequences left then right or just left if right is missing).
|
|
Matcher MaybeSeq(Matcher left, std::optional<Matcher> right) {
|
|
if (right) {
|
|
return builder.Seq(std::move(left), std::move(right.value()));
|
|
}
|
|
return left;
|
|
}
|
|
|
|
// Builds matcher for '+' expression.
|
|
Matcher MakePlus(Matcher nested) { return builder.Seq(nested, builder.Star(nested)); }
|
|
|
|
// Given regex matcher, check if it is followed by '*' or '+' and
|
|
// wrap it if needed, then continue gathering sequence of matches
|
|
// (see ParseRun)
|
|
ParseResult MaybeStar(Matcher left, std::string_view str) {
|
|
if (IsCharAt(str, 0, '*')) {
|
|
left = builder.Star(std::move(left));
|
|
str.remove_prefix(1);
|
|
if (IsCharAt(str, 0, '?')) {
|
|
// We don't produce actual matching, so there is not
|
|
// difference between lazy and eager matching. But lets
|
|
// support the syntax anyways, by ignoring lazyness marker
|
|
str.remove_prefix(1);
|
|
}
|
|
}
|
|
if (IsCharAt(str, 0, '+')) {
|
|
left = MakePlus(std::move(left));
|
|
str.remove_prefix(1);
|
|
if (IsCharAt(str, 0, '?')) {
|
|
// We don't produce actual matching, so there is not
|
|
// difference between lazy and eager matching. But lets
|
|
// support the syntax anyways, by ignoring lazyness marker
|
|
str.remove_prefix(1);
|
|
}
|
|
}
|
|
if (IsCharAt(str, 0, '?')) {
|
|
left = builder.Alt(builder.Any(), std::move(left));
|
|
str.remove_prefix(1);
|
|
}
|
|
auto [maybe_right, new_str] = ParseRun(str);
|
|
return {MaybeSeq(left, std::move(maybe_right)), new_str};
|
|
}
|
|
|
|
Matcher CompileOrDie(std::string_view str) {
|
|
ErrorPolicy::StartedParsing(str);
|
|
auto [maybe_m, new_str] = ParseAlt(str);
|
|
if (!new_str.empty()) {
|
|
ErrorPolicy::NoteError("failed to parse entire re string", new_str);
|
|
}
|
|
if (!maybe_m) {
|
|
return builder.Any();
|
|
}
|
|
return maybe_m.value();
|
|
}
|
|
};
|
|
|
|
} // namespace re_compiler
|
|
|
|
inline Matcher CompileREOrDie(std::string_view str) { return re_compiler::C<matchers::MatcherBuilder>({}).CompileOrDie(str); }
|
|
|
|
} // namespace trivialre
|
|
|
|
#endif // TRIVIALRE_H_
|