C/C++ API Reference
Loading...
Searching...
No Matches
detokenize.h
1// Copyright 2020 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14
15// This file provides the Detokenizer class, which is used to decode tokenized
16// strings. To use a Detokenizer, load a binary format token database into
17// memory, construct a TokenDatabase, and pass it to a Detokenizer:
18//
19// std::vector data = ReadFile("my_tokenized_strings.db");
20// Detokenizer detok(TokenDatabase::Create(data));
21//
22// DetokenizedString result = detok.Detokenize(my_data);
23// std::cout << result.BestString() << '\n';
24//
25#pragma once
26
27#include <cstddef>
28#include <cstdint>
29#include <string>
30#include <unordered_map>
31#include <utility>
32#include <vector>
33
34#include "pw_result/result.h"
35#include "pw_span/span.h"
36#include "pw_stream/stream.h"
37#include "pw_tokenizer/internal/decode.h"
38#include "pw_tokenizer/token_database.h"
39#include "pw_tokenizer/tokenize.h"
40
41namespace pw::tokenizer {
42
44
45class Detokenizer;
46
/// Token database entry: a FormatString paired with a uint32_t that the inline
/// comment labels as the date removed.
48using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;
/// Two-level lookup table: a std::string key maps to a table from uint32_t to
/// the list of TokenizedStringEntry values stored under that key.
/// NOTE(review): judging by the alias name, the outer key is presumably the
/// domain and the inner key the token -- confirm against the code that
/// populates this map.
49using DomainTokenEntriesMap = std::unordered_map<
50 std::string,
51 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>>;
52
56 public:
57 DetokenizedString(const Detokenizer& detokenizer,
58 bool recursion,
59 uint32_t token,
61 const span<const std::byte>& arguments);
62
/// Creates an empty result: has_token_ is false and no matches are stored.
/// token_ is zero-initialized so that token(), which hands out a reference to
/// it, never exposes an indeterminate value on a default-constructed object.
63 DetokenizedString() : token_(0), has_token_(false) {}
64
/// True if there was only one match that decoded successfully.
///
/// Returns false when matches_ is empty, when no match reports ok(), or when
/// more than one match reports ok().
66 bool ok() const {
67 bool successful_decode = false;  // Set once the first ok() match is seen.
68 for (const auto& match : matches_) {
69 if (match.ok()) {
70 if (successful_decode) {
71 return false;  // A second successful match makes the result ambiguous.
72 }
73 successful_decode = true;
74 }
75 }
76
77 return successful_decode;
78 }
79
/// Returns the strings that matched the token, with the best matches first.
/// The returned reference refers to the matches_ member of this object.
81 const std::vector<DecodedFormatString>& matches() const { return matches_; }
82
/// Returns a const reference to the stored token_ member.
83 const uint32_t& token() const { return token_; }
84
/// Returns a const reference to the stored best_string_ member.
/// NOTE(review): how best_string_ is selected from the matches is decided
/// where it is assigned, which is not visible in this header -- confirm there.
88 const std::string& BestString() const { return best_string_; }
89
/// Returns a std::string by value; the definition is out of line.
/// NOTE(review): presumably the best-match string with decoding error details
/// included, going by the name -- confirm against the implementation.
92 std::string BestStringWithErrors() const;
93
94 private:
95 uint32_t token_;
96 std::string best_string_;
97 bool has_token_;
98 std::vector<DecodedFormatString> matches_;
99};
100
104 public:
/// Constructs a detokenizer from a TokenDatabase; the definition is out of
/// line. Marked explicit, so a TokenDatabase does not convert implicitly.
108 explicit Detokenizer(const TokenDatabase& database);
109
/// Constructs a detokenizer by directly passing the parsed database.
/// The map is moved into database_, so the caller's argument is left in a
/// moved-from state.
111 explicit Detokenizer(DomainTokenEntriesMap&& database)
112 : database_(std::move(database)) {}
113
117
120 return FromElfSection(as_bytes(elf_section));
121 }
122
126
/// Constructs a detokenizer from a CSV database.
/// The outcome is wrapped in a Result<Detokenizer>; the definition is out of
/// line.
128 static Result<Detokenizer> FromCsv(std::string_view csv);
129
133 std::string_view domain = kDefaultDomain) const {
134 return Detokenize(encoded, domain, false);
135 }
136
139 std::string_view domain = kDefaultDomain) const {
140 return Detokenize(as_bytes(encoded), domain);
141 }
142
/// Overload of Detokenize for std::string_view.
/// Forwards the view's data pointer and size to the pointer-and-length
/// overload, passing domain through unchanged.
144 DetokenizedString Detokenize(std::string_view encoded,
145 std::string_view domain = kDefaultDomain) const {
146 return Detokenize(encoded.data(), encoded.size(), domain);
147 }
148
/// Overload of Detokenize for a pointer and length.
/// Reinterprets the buffer as size_bytes std::byte elements, wraps it in a
/// span, and forwards to the span overload with the same domain.
150 DetokenizedString Detokenize(const void* encoded,
151 size_t size_bytes,
152 std::string_view domain = kDefaultDomain) const {
153 return Detokenize(span(static_cast<const std::byte*>(encoded), size_bytes),
154 domain);
155 }
156
160 const span<const std::byte>& encoded,
161 std::string_view domain = kDefaultDomain) const {
162 return Detokenize(encoded, domain, true);
163 }
164
167 const span<const uint8_t>& encoded,
168 std::string_view domain = kDefaultDomain) const {
169 return RecursiveDetokenize(as_bytes(encoded), domain);
170 }
171
174 std::string_view encoded,
175 std::string_view domain = kDefaultDomain) const {
176 return RecursiveDetokenize(encoded.data(), encoded.size(), domain);
177 }
178
181 const void* encoded,
182 size_t size_bytes,
183 std::string_view domain = kDefaultDomain) const {
184 return RecursiveDetokenize(
185 span(static_cast<const std::byte*>(encoded), size_bytes), domain);
186 }
187
/// Produces a DetokenizedString from a std::string_view; the definition is out
/// of line.
/// NOTE(review): presumably text is a single Base64-encoded tokenized message,
/// going by the name -- confirm the accepted format in the implementation.
190 DetokenizedString DetokenizeBase64Message(std::string_view text) const;
191
/// Detokenizes text by delegating to DetokenizeTextRecursive with the pass
/// limit fixed at kMaxDecodePasses, and returns the resulting string.
201 std::string DetokenizeText(std::string_view text) const {
202 return DetokenizeTextRecursive(text, kMaxDecodePasses);
203 }
204
219 const span<const std::byte>& optionally_tokenized_data);
220
/// Returns a const reference to the DomainTokenEntriesMap held in database_.
221 const DomainTokenEntriesMap& database() const { return database_; }
222
224 uint32_t token, std::string_view domain) const;
225
226 private:
227 // 4 passes supports detokenizing two layers of nested messages with tokenized
228 // domains (e.g. ${${bar}#ab12cd34}#00000012), without allowing a hypothetical
229 // detokenization cycle to continue for too long.
230 static constexpr unsigned kMaxDecodePasses = 4;
231
/// Helper behind DetokenizeText, which calls it with kMaxDecodePasses; the
/// definition is out of line.
/// NOTE(review): max_passes presumably caps how many nested decode passes are
/// attempted -- confirm against the implementation.
232 std::string DetokenizeTextRecursive(std::string_view text,
233 unsigned max_passes) const;
234
238 std::string_view domain,
239 bool recursion) const;
240
241 DomainTokenEntriesMap database_;
242};
243
245
246} // namespace pw::tokenizer
Definition: poll.h:25
Definition: span_impl.h:235
Definition: stream.h:400
Definition: detokenize.h:55
const std::vector< DecodedFormatString > & matches() const
Returns the strings that matched the token, with the best matches first.
Definition: detokenize.h:81
bool ok() const
True if there was only one match that decoded successfully.
Definition: detokenize.h:66
std::string BestStringWithErrors() const
const std::string & BestString() const
Definition: detokenize.h:88
Definition: detokenize.h:103
DetokenizedString Detokenize(const span< const std::byte > &encoded, std::string_view domain=kDefaultDomain) const
Definition: detokenize.h:132
DetokenizedString Detokenize(const span< const uint8_t > &encoded, std::string_view domain=kDefaultDomain) const
Overload of Detokenize for span<const uint8_t>.
Definition: detokenize.h:138
std::string DetokenizeText(std::string_view text) const
Definition: detokenize.h:201
static Result< Detokenizer > FromElfFile(stream::SeekableReader &stream)
static Result< Detokenizer > FromCsv(std::string_view csv)
Constructs a detokenizer from a CSV database.
DetokenizedString RecursiveDetokenize(const span< const std::byte > &encoded, std::string_view domain=kDefaultDomain) const
Definition: detokenize.h:159
DetokenizedString DetokenizeBase64Message(std::string_view text) const
static Result< Detokenizer > FromElfSection(span< const uint8_t > elf_section)
Overload of FromElfSection for a uint8_t span.
Definition: detokenize.h:119
Detokenizer(const TokenDatabase &database)
DetokenizedString Detokenize(const void *encoded, size_t size_bytes, std::string_view domain=kDefaultDomain) const
Overload of Detokenize for a pointer and length.
Definition: detokenize.h:150
Detokenizer(DomainTokenEntriesMap &&database)
Constructs a detokenizer by directly passing the parsed database.
Definition: detokenize.h:111
DetokenizedString Detokenize(std::string_view encoded, std::string_view domain=kDefaultDomain) const
Overload of Detokenize for std::string_view.
Definition: detokenize.h:144
DetokenizedString RecursiveDetokenize(const span< const uint8_t > &encoded, std::string_view domain=kDefaultDomain) const
Overload of RecursiveDetokenize for span<const uint8_t>.
Definition: detokenize.h:166
static Result< Detokenizer > FromElfSection(span< const std::byte > elf_section)
DetokenizedString RecursiveDetokenize(const void *encoded, size_t size_bytes, std::string_view domain=kDefaultDomain) const
Overload of RecursiveDetokenize for a pointer and length.
Definition: detokenize.h:180
DetokenizedString RecursiveDetokenize(std::string_view encoded, std::string_view domain=kDefaultDomain) const
Overload of RecursiveDetokenize for std::string_view.
Definition: detokenize.h:173
std::string DecodeOptionallyTokenizedData(const span< const std::byte > &optionally_tokenized_data)
Definition: token_database.h:77
std::pair< FormatString, uint32_t > TokenizedStringEntry
Token database entry.
Definition: detokenize.h:48