11#include "third_party/abseil-cpp/absl/log/log.h"
12#include "third_party/abseil-cpp/absl/strings/str_cat.h"
14namespace fs = std::filesystem;
// Reports whether two string views share at least one byte of storage.
//
// Each view is treated as the half-open address range
// [data(), data() + size()); the result is true when each range starts before
// the other one ends. The comparison is made on addresses, not on contents, so
// the answer is only meaningful when both views point into the same underlying
// buffer.
//
// Note: a zero-length view positioned strictly inside the other view also
// satisfies both inequalities and is therefore reported as overlapping.
bool Overlaps(std::string_view a, std::string_view b) {
  const char* const start1 = a.data();
  const char* const end1 = start1 + a.size();
  const char* const start2 = b.data();
  const char* const end2 = start2 + b.size();

  return start1 < end2 && start2 < end1;
}
// Returns true when `str` ends with `suffix`.
//
// A suffix longer than `str` can never match. An empty suffix matches every
// string, including the empty one.
bool EndsWith(std::string_view str, std::string_view suffix) {
  if (suffix.length() > str.length()) {
    return false;
  }
  // The guard above keeps the subtraction from wrapping around.
  return str.substr(str.length() - suffix.length()) == suffix;
}
33std::string IgnoreWhitespace(std::string_view
input) {
34 bool in_whitespace =
false;
35 std::string result =
"";
36 for (
size_t i = 0;
i <
input.size(); ++
i) {
38 if (std::isspace(current)) {
40 result.append(
"\\s+");
44 result.push_back(current);
45 in_whitespace =
false;
48 if (EndsWith(result,
"\\s+")) {
49 result.erase(result.end() - 3, result.end());
// Runs a single matcher against `query` and, per the return type, yields an
// optional Catalog::Match labelled with `matcher_name`.
//
// NOTE(review): this listing has the original line numbers fused onto each
// line and several original lines are absent (the gaps in the numbering show
// this, e.g. 56, 59, 63-68, 81-82, 90-92). The `matcher` parameter used below
// is declared on one of the absent lines, and the return statements are not
// visible either. The comments describe only what the visible lines do.
54std::optional<Catalog::Match> FindMatchForSelectedMatcher(
55 std::string_view query,
57 std::string_view matcher_name) {
58 int num_groups = matcher->NumberOfCapturingGroups();
// Case 1: the pattern has no capture groups. The match is attempted unanchored
// over the whole query; the rest of this call and what is returned are not
// visible here.
60 if (num_groups == 0) {
61 std::string_view match_text;
62 if (matcher->Match(query, 0, query.length(), RE2::Anchor::UNANCHORED,
// Case 2 (presumably the else branch; the branch line itself is not visible):
// request the full match (index 0) plus one submatch per capture group.
69 std::vector<re2::StringPiece> submatches(num_groups + 1);
70 if (matcher->Match(query, 0, query.length(), RE2::Anchor::UNANCHORED,
71 submatches.data(), num_groups + 1)) {
72 std::string_view full_match = submatches[0];
73 const char* full_match_end = full_match.data() + full_match.size();
// Accumulates the pieces of the full match that lie outside the capture
// groups. It can never be longer than the full match, hence the reserve.
75 std::string non_group_text;
76 non_group_text.reserve(full_match.size());
// `position` is the cursor into the full match: everything before it has
// already been either copied or skipped as group text.
77 const char* position = full_match.data();
78 for (
int i = 1;
i <= num_groups; ++
i) {
79 std::string_view submatch = submatches[
i];
// A null data pointer is special-cased; the body of this branch is not
// visible (presumably the group did not participate and is skipped).
80 if (submatch.data() ==
nullptr) {
// Copy the text between the cursor and the start of this group, then move
// the cursor past the group so the group's own text is left out.
// NOTE(review): the cursor is reassigned unconditionally, which appears to
// assume groups are ordered and non-nested; confirm.
83 if (submatch.data() > position) {
84 non_group_text.append(position, submatch.data() - position);
86 position = submatch.data() + submatch.size();
// Copy whatever follows the last group, up to the end of the full match.
88 if (position !=
nullptr && position < full_match_end) {
89 non_group_text.append(position, full_match_end - position);
// Tail of a call whose beginning is not visible; the accumulated text is
// moved into it.
93 std::move(non_group_text));
// Body of a function that builds a Catalog from the files found under
// `<data_dir>/licenses`. The signature line is not visible in this listing;
// it is presumably Catalog::Open(std::string_view data_dir), since that is the
// declaration taking `data_dir`.
//
// NOTE(review): original line numbers are fused onto each line and several
// lines are absent (e.g. 124, 127-130, 132, 139, 147), so some guard
// conditions and the declaration of `err` are not visible.
//
// Visible error returns: InvalidArgumentError when the data directory or its
// `licenses` subdirectory does not exist, when a file cannot be opened or
// parsed, when a unique key cannot be added, or when a matcher cannot be
// made; UnknownError when the selector fails to compile.
102 fs::path data_dir_path(data_dir);
103 if (!fs::exists(data_dir_path)) {
104 return absl::InvalidArgumentError(
105 absl::StrCat(
"Data directory doesn't exist ", data_dir));
107 fs::path licenses_path = data_dir_path /
"licenses";
108 if (!fs::exists(licenses_path)) {
109 return absl::InvalidArgumentError(absl::StrCat(
110 "Licenses directory doesn't exist ", licenses_path.string()));
// Three collections filled in step inside the loop below: `selector` receives
// each entry's `unique` pattern, `matchers` the compiled `matcher` pattern and
// `names` the entry name.
113 RE2::Set selector(RE2::Options(), RE2::Anchor::UNANCHORED);
114 std::vector<std::unique_ptr<RE2>> matchers;
115 std::vector<std::string> names;
// Each directory entry is opened and parsed as one catalog entry.
117 for (
const fs::path& file : fs::directory_iterator(licenses_path)) {
118 std::ifstream infile(file.string());
119 if (!infile.good()) {
120 return absl::InvalidArgumentError(
"Unable to open file " + file.string());
123 absl::StatusOr<Entry> entry =
ParseEntry(infile);
// Parse-failure return; its guard and the end of the message are not visible.
125 return absl::InvalidArgumentError(
126 absl::StrCat(
"Unable to parse data entry at ", file.string(),
" : ",
// Register the entry's `unique` pattern with the selector set. The guard for
// the error return that follows is not visible.
131 selector.Add(entry->unique, &err);
133 return absl::InvalidArgumentError(absl::StrCat(
134 "Unable to add unique key from ", file.string(),
" : ", err));
// The name is moved out of the parsed entry; `entry->name` must not be read
// after this point.
136 names.emplace_back(std::move(entry->name));
// Compile the entry's full matcher pattern. The check guarding the error
// return below is not visible.
138 auto matcher_re2 = std::make_unique<RE2>(entry->matcher);
140 return absl::InvalidArgumentError(
"Unable to make matcher.");
143 matchers.emplace_back(std::move(matcher_re2));
// After all entries are added the set is compiled; the error return below is
// presumably taken when `did_compile` is false (guard not visible).
146 bool did_compile = selector.Compile();
148 return absl::UnknownError(
"Unable to compile selector.");
151 return Catalog(std::move(selector), std::move(matchers), std::move(names));
// Body of a function that builds a Catalog directly from in-memory `entries`.
// The signature line is not visible in this listing; it is presumably
// Catalog::Make(const std::vector<Entry>& entries).
//
// NOTE(review): original line numbers are fused onto each line and several
// lines are absent (e.g. 160, 163, 171), so the declaration of `err` and the
// guards on the error returns are not visible.
155 RE2::Set selector(RE2::Options(), RE2::Anchor::UNANCHORED);
156 std::vector<std::unique_ptr<RE2>> matchers;
157 std::vector<std::string> names;
// For every entry: copy its name, add its `unique` pattern to the selector
// set, and compile its `matcher` pattern.
159 for (
const Entry& entry : entries) {
161 names.push_back(std::string(entry.name));
// The error return below presumably depends on `idx` (guard not visible).
162 int idx = selector.Add(entry.unique, &err);
164 return absl::InvalidArgumentError(
165 absl::StrCat(
"Unable to add set entry: ", entry.unique,
" ", err));
// NOTE(review): no validity check on the compiled matcher is visible here.
167 matchers.push_back(std::make_unique<RE2>(entry.matcher));
// The set is compiled once every entry has been added; the error return is
// presumably taken when `did_compile` is false (guard not visible).
170 bool did_compile = selector.Compile();
172 return absl::OutOfRangeError(
"RE2::Set ran out of memory.");
174 return Catalog(std::move(selector), std::move(matchers), std::move(names));
177Catalog::Catalog(RE2::Set selector,
178 std::vector<std::unique_ptr<RE2>> matchers,
179 std::vector<std::string> names)
180 : selector_(
std::move(selector)),
181 matchers_(
std::move(matchers)),
182 names_(
std::move(names)) {}
// Tail of the signature and body of a const member function taking `query`;
// presumably Catalog::FindMatch, whose declaration returns
// absl::StatusOr<std::vector<Match>>.
//
// NOTE(review): original line numbers are fused onto each line and several
// lines are absent (e.g. 205, 212-214, 216, 220-222, 238 onward), so the else
// branch, the separator logic inside the name loops and the final return are
// not visible.
//
// Visible flow:
//   1. Ask the selector set which entries match `query`; NotFoundError if
//      the selector does not match.
//   2. Run each selected entry's matcher; record hits and misses.
//   3. If any selected matcher missed, return NotFoundError naming the
//      missed (and, if any, the hit) matchers.
//   4. Reject results whose matched text overlaps with InvalidArgumentError.
187 std::string_view query)
const {
188 std::vector<int> selector_results;
189 if (!selector_.Match(query, &selector_results)) {
190 return absl::NotFoundError(
"Selector didn't match.");
// `missed_results` and `hit_results` hold selector indices and are used only
// to build the error message below.
193 std::vector<Catalog::Match> results;
194 std::vector<int> missed_results;
195 missed_results.reserve(selector_results.size());
196 std::vector<int> hit_results;
197 hit_results.reserve(selector_results.size());
// The selector index is used for both matchers_ and names_.
198 for (
int selector_result : selector_results) {
199 RE2* matcher = matchers_[selector_result].get();
200 std::optional<Match> match =
201 FindMatchForSelectedMatcher(query, matcher, names_[selector_result]);
202 if (match.has_value()) {
203 results.emplace_back(std::move(match.value()));
204 hit_results.push_back(selector_result);
// Miss case (the else line itself is not visible).
206 missed_results.push_back(selector_result);
// A size mismatch means at least one selected matcher failed to match.
209 if (selector_results.size() != results.size()) {
// Collect the names of the matchers that missed.
210 std::stringstream missed;
211 for (
size_t i = 0;
i < missed_results.size(); ++
i) {
215 missed << names_[missed_results[
i]];
// Collect the names of the matchers that hit; this text is appended to the
// error only when there was at least one hit.
217 std::stringstream hit;
218 hit <<
" Hit matcher(s): (";
219 for (
size_t i = 0;
i < hit_results.size(); ++
i) {
223 hit << names_[hit_results[
i]];
226 return absl::NotFoundError(
227 absl::StrCat(
"Selected matcher(s) (", missed.str(),
") didn't match.",
228 hit_results.empty() ?
"" : hit.str()));
// Pairwise check over all results (j starts at i + 1, so each unordered pair
// is visited once): any two matched texts that overlap are an error.
230 for (
size_t i = 0;
i < results.size(); ++
i) {
231 for (
size_t j =
i + 1; j < results.size(); ++j) {
232 if (Overlaps(results[
i].GetMatchedText(),
233 results[j].GetMatchedText())) {
// The message names both matchers and prints both matched texts.
234 return absl::InvalidArgumentError(absl::StrCat(
235 "Selected matchers overlap (", results[
i].GetMatcher(),
", ",
236 results[j].GetMatcher(),
").\n", results[
i].GetMatchedText(),
237 "\n############\n", results[j].GetMatchedText()));
// Body of a function that reads one catalog entry from the stream `is`;
// presumably Catalog::ParseEntry(std::istream& is), whose declaration returns
// absl::StatusOr<Entry>.
//
// NOTE(review): original line numbers are fused onto each line and several
// lines are absent (e.g. 247, 249-250, 252, 254-255, 257, 259-260, 267-268),
// so the signature, the declarations of `name` and `unique`, the guards on
// each "Bad stream." return and the start of the returned initializer are
// not visible.
//
// Visible input layout: the first line is read into `name`, the second line
// into `unique`, and all remaining bytes are the matcher text.
248 return absl::InvalidArgumentError(
"Bad stream.");
251 std::getline(is,
name);
253 return absl::InvalidArgumentError(
"Bad stream.");
256 std::getline(is, unique);
258 return absl::InvalidArgumentError(
"Bad stream.");
// Read everything left in the stream as the matcher text.
261 std::string matcher_text((std::istreambuf_iterator<char>(is)),
262 std::istreambuf_iterator<char>());
// Pass the matcher text through IgnoreWhitespace before it is stored.
264 std::string ignore_whitespace_matcher = IgnoreWhitespace(matcher_text);
266 VLOG(4) <<
"matcher:" <<
name <<
":\n" << ignore_whitespace_matcher;
// Tail of a designated-initializer return; the `unique` and rewritten matcher
// strings are moved into the result.
269 .unique = std::move(unique),
270 .matcher = std::move(ignore_whitespace_matcher)};
static Match MakeWithView(std::string_view matcher, std::string_view matched_text)
static Match MakeWithString(std::string_view matcher, std::string matched_text)
static absl::StatusOr< Entry > ParseEntry(std::istream &is)
VisibleForTesting.
static absl::StatusOr< Catalog > Open(std::string_view data_dir)
absl::StatusOr< std::vector< Match > > FindMatch(std::string_view query) const
Tries to identify a match for the query across the Catalog.
static absl::StatusOr< Catalog > Make(const std::vector< Entry > &entries)
Make a Catalog for testing.