#include <functional>

#include "third_party/abseil-cpp/absl/log/log.h"
#include "third_party/abseil-cpp/absl/strings/str_cat.h"
14namespace fs = std::filesystem;
// Returns true when `a` begins before `b` ends and `b` begins before `a` ends,
// treating each view as the half-open address range
// [data(), data() + size()). For two non-empty views this is exactly "the
// views share at least one byte of storage".
//
// The comparison is on addresses, not on contents: views holding equal text in
// different buffers do not overlap. An empty view located strictly inside the
// other view is still reported as overlapping.
//
// The two views are not required to point into the same buffer. The built-in
// `<` yields an unspecified result for pointers into unrelated objects, so the
// comparisons go through std::less, which guarantees a strict total order over
// all pointer values.
bool Overlaps(std::string_view a, std::string_view b) {
  const std::less<const char*> before{};

  const char* const a_begin = a.data();
  const char* const a_end = a_begin + a.size();
  const char* const b_begin = b.data();
  const char* const b_end = b_begin + b.size();

  return before(a_begin, b_end) && before(b_begin, a_end);
}
// Reports whether the final characters of `str` are exactly `suffix`.
// An empty `suffix` always matches; a `suffix` longer than `str` never does.
bool EndsWith(std::string_view str, std::string_view suffix) {
  const size_t str_len = str.length();
  const size_t suffix_len = suffix.length();
  if (str_len < suffix_len) {
    return false;
  }
  // Compare only the tail of `str` that is as long as `suffix`.
  return str.compare(str_len - suffix_len, suffix_len, suffix) == 0;
}
33std::string IgnoreWhitespace(std::string_view
input) {
34 bool in_whitespace =
false;
35 std::string result =
"";
36 for (
size_t i = 0;
i <
input.size(); ++
i) {
38 if (std::isspace(current)) {
40 result.append(
"\\s+");
44 result.push_back(current);
45 in_whitespace =
false;
48 if (EndsWith(result,
"\\s+")) {
49 result.erase(result.end() - 3, result.end());
// Runs `matcher` against `query` (unanchored) and returns the outcome as an
// optional Catalog::Match.
// NOTE(review): several lines of this function, including the parameter that
// declares `matcher` and the return statements, are not visible in this
// excerpt. The comments below describe only the statements that are visible.
54std::optional<Catalog::Match> FindMatchForSelectedMatcher(
55 std::string_view query,
57 std::string_view matcher_name) {
58 int num_groups = matcher->NumberOfCapturingGroups();
// Pattern without capturing groups: a single unanchored match over the whole
// query is attempted.
60 if (num_groups == 0) {
61 std::string_view match_text;
62 if (matcher->Match(query, 0, query.length(), RE2::Anchor::UNANCHORED,
// Pattern with capturing groups: slot 0 receives the full match and slots
// 1..num_groups receive the individual groups.
69 std::vector<re2::StringPiece> submatches(num_groups + 1);
70 if (matcher->Match(query, 0, query.length(), RE2::Anchor::UNANCHORED,
71 submatches.data(), num_groups + 1)) {
72 std::string_view full_match = submatches[0];
73 const char* full_match_end = full_match.data() + full_match.size();
// Collect the parts of the full match that lie outside the capturing groups.
75 std::string non_group_text;
76 non_group_text.reserve(full_match.size());
// `position` is the first byte of the full match not yet copied or skipped.
77 const char* position = full_match.data();
78 for (
int i = 1;
i <= num_groups; ++
i) {
79 std::string_view submatch = submatches[
i];
// Copy the text between the previous group (or the match start) and this
// group, then step over the group itself.
// NOTE(review): this walk assumes the groups appear in increasing address
// order and that every group took part in the match. RE2 reports a group that
// did not participate with a null data() pointer, which would reset `position`
// to null here - confirm the catalog patterns never use optional or nested
// groups.
80 if (submatch.data() > position) {
81 non_group_text.append(position, submatch.data() - position);
83 position = submatch.data() + submatch.size();
// Copy whatever follows the last group up to the end of the full match.
85 if (position < full_match_end) {
86 non_group_text.append(position, full_match_end - position);
90 std::move(non_group_text));
// NOTE(review): this is the body of a function whose signature is not visible
// in this excerpt (presumably Catalog::Open, given the use of `data_dir`). It
// builds a Catalog from the files found in `<data_dir>/licenses`.
// Both the data directory and its `licenses` sub-directory must exist.
99 fs::path data_dir_path(data_dir);
100 if (!fs::exists(data_dir_path)) {
101 return absl::InvalidArgumentError(
102 absl::StrCat(
"Data directory doesn't exist ", data_dir));
104 fs::path licenses_path = data_dir_path /
"licenses";
105 if (!fs::exists(licenses_path)) {
106 return absl::InvalidArgumentError(absl::StrCat(
107 "Licenses directory doesn't exist ", licenses_path.string()));
// `selector` holds one pattern per entry (the entry's `unique` text), while
// `matchers` and `names` receive that entry's matcher regex and name.
110 RE2::Set selector(RE2::Options(), RE2::Anchor::UNANCHORED);
111 std::vector<std::unique_ptr<RE2>> matchers;
112 std::vector<std::string> names;
// Every directory entry is opened as a file and parsed as one catalog entry.
// NOTE(review): std::filesystem::directory_iterator visits entries in an
// unspecified order, so entry indices may differ between runs or platforms.
114 for (
const fs::path& file : fs::directory_iterator(licenses_path)) {
115 std::ifstream infile(file.string());
116 if (!infile.good()) {
117 return absl::InvalidArgumentError(
"Unable to open file " + file.string());
120 absl::StatusOr<Entry> entry =
ParseEntry(infile);
// Error return for an entry that could not be parsed (the guarding condition
// is not visible in this excerpt).
122 return absl::InvalidArgumentError(
123 absl::StrCat(
"Unable to parse data entry at ", file.string(),
" : ",
// Register the entry's `unique` pattern with the selector set; `err` receives
// the failure description used in the error below.
128 selector.Add(entry->unique, &err);
130 return absl::InvalidArgumentError(absl::StrCat(
131 "Unable to add unique key from ", file.string(),
" : ", err));
// `entry->name` is moved from here; only `entry->matcher` is read afterwards.
133 names.emplace_back(std::move(entry->name));
135 auto matcher_re2 = std::make_unique<RE2>(entry->matcher);
137 return absl::InvalidArgumentError(
"Unable to make matcher.");
140 matchers.emplace_back(std::move(matcher_re2));
// The set must be compiled before it can be used for matching.
// NOTE(review): the other factory visible in this file reports a failed
// Compile() as OutOfRangeError, whereas this path uses UnknownError - confirm
// whether the difference is intentional.
143 bool did_compile = selector.Compile();
145 return absl::UnknownError(
"Unable to compile selector.");
148 return Catalog(std::move(selector), std::move(matchers), std::move(names));
// NOTE(review): this is the body of a function whose signature is not visible
// in this excerpt (presumably Catalog::Make, given the use of `entries`). It
// builds a Catalog directly from in-memory entries.
152 RE2::Set selector(RE2::Options(), RE2::Anchor::UNANCHORED);
153 std::vector<std::unique_ptr<RE2>> matchers;
154 std::vector<std::string> names;
// For each entry: record its name, add its `unique` pattern to the selector
// set, and build its matcher regex.
156 for (
const Entry& entry : entries) {
158 names.push_back(std::string(entry.name));
// `err` receives the failure description used in the error below (its
// declaration and the check on `idx` are not visible in this excerpt).
159 int idx = selector.Add(entry.unique, &err);
161 return absl::InvalidArgumentError(
162 absl::StrCat(
"Unable to add set entry: ", entry.unique,
" ", err));
// NOTE(review): no validity check on the newly built RE2 is visible here -
// confirm whether an invalid `entry.matcher` pattern is rejected elsewhere.
164 matchers.push_back(std::make_unique<RE2>(entry.matcher));
// The set must be compiled before it can be used for matching.
167 bool did_compile = selector.Compile();
169 return absl::OutOfRangeError(
"RE2::Set ran out of memory.");
171 return Catalog(std::move(selector), std::move(matchers), std::move(names));
174Catalog::Catalog(RE2::Set selector,
175 std::vector<std::unique_ptr<RE2>> matchers,
176 std::vector<std::string> names)
177 : selector_(
std::move(selector)),
178 matchers_(
std::move(matchers)),
179 names_(
std::move(names)) {}
// NOTE(review): the start of this signature and several interior lines are not
// visible in this excerpt (presumably Catalog::FindMatch). The comments below
// describe only the statements that are visible.
184 std::string_view query)
const {
// Stage 1: ask the selector set which entries apply to `query`. The values
// written to `selector_results` are used below as indices into `matchers_`
// and `names_`.
185 std::vector<int> selector_results;
186 if (!selector_.Match(query, &selector_results)) {
187 return absl::NotFoundError(
"Selector didn't match.");
// Stage 2: run the matcher of every selected entry, keeping track of which
// selected entries produced a match (hit) and which did not (missed).
190 std::vector<Catalog::Match> results;
191 std::vector<int> missed_results;
192 missed_results.reserve(selector_results.size());
193 std::vector<int> hit_results;
194 hit_results.reserve(selector_results.size());
195 for (
int selector_result : selector_results) {
196 RE2* matcher = matchers_[selector_result].get();
197 std::optional<Match> match =
198 FindMatchForSelectedMatcher(query, matcher, names_[selector_result]);
199 if (match.has_value()) {
200 results.emplace_back(std::move(match.value()));
201 hit_results.push_back(selector_result);
203 missed_results.push_back(selector_result);
// Every selected entry must also match. Otherwise report the names of the
// entries that missed, followed by the names of those that hit (if any).
206 if (selector_results.size() != results.size()) {
207 std::stringstream missed;
208 for (
size_t i = 0;
i < missed_results.size(); ++
i) {
212 missed << names_[missed_results[
i]];
214 std::stringstream hit;
215 hit <<
" Hit matcher(s): (";
216 for (
size_t i = 0;
i < hit_results.size(); ++
i) {
220 hit << names_[hit_results[
i]];
223 return absl::NotFoundError(
224 absl::StrCat(
"Selected matcher(s) (", missed.str(),
") didn't match.",
225 hit_results.empty() ?
"" : hit.str()));
// Stage 3: compare every pair of matches and reject the query if the matched
// text of any two of them overlaps.
227 for (
size_t i = 0;
i < results.size(); ++
i) {
228 for (
size_t j =
i + 1; j < results.size(); ++j) {
229 if (Overlaps(results[
i].GetMatchedText(),
230 results[j].GetMatchedText())) {
231 return absl::InvalidArgumentError(absl::StrCat(
232 "Selected matchers overlap (", results[
i].GetMatcher(),
", ",
233 results[j].GetMatcher(),
").\n", results[
i].GetMatchedText(),
234 "\n############\n", results[j].GetMatchedText()));
// NOTE(review): this is the body of a function whose signature is not visible
// in this excerpt (presumably Catalog::ParseEntry, given the stream `is`). It
// reads one catalog entry from `is`: first the name, then the unique pattern,
// then the matcher text.
// The conditions guarding the three "Bad stream." returns below are not
// visible in this excerpt.
245 return absl::InvalidArgumentError(
"Bad stream.");
// First line read from the stream: stored in `name`.
248 std::getline(is,
name);
250 return absl::InvalidArgumentError(
"Bad stream.");
// Next line read from the stream: stored in `unique`.
253 std::getline(is, unique);
255 return absl::InvalidArgumentError(
"Bad stream.");
// Everything left in the stream is the matcher text.
258 std::string matcher_text((std::istreambuf_iterator<char>(is)),
259 std::istreambuf_iterator<char>());
// Rewrite the matcher text with IgnoreWhitespace before storing it.
261 std::string ignore_whitespace_matcher = IgnoreWhitespace(matcher_text);
// Verbose log (level 4) of the final matcher, labelled with the entry name.
263 VLOG(4) <<
"matcher:" <<
name <<
":\n" << ignore_whitespace_matcher;
// Visible fields of the returned entry; the start of the initializer is not
// visible in this excerpt.
266 .unique = std::move(unique),
267 .matcher = std::move(ignore_whitespace_matcher)};
static Match MakeWithView(std::string_view matcher, std::string_view matched_text)
static Match MakeWithString(std::string_view matcher, std::string matched_text)
static absl::StatusOr< Entry > ParseEntry(std::istream &is)
VisibleForTesting.
static absl::StatusOr< Catalog > Open(std::string_view data_dir)
absl::StatusOr< std::vector< Match > > FindMatch(std::string_view query) const
Tries to identify a match for the query across the Catalog.
static absl::StatusOr< Catalog > Make(const std::vector< Entry > &entries)
Make a Catalog for testing.