Pigweed
C/C++ API Reference
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
Loading...
Searching...
No Matches
detokenize.h
1// Copyright 2020 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14
15// This file provides the Detokenizer class, which is used to decode tokenized
16// strings. To use a Detokenizer, load a binary format token database into
17// memory, construct a TokenDatabase, and pass it to a Detokenizer:
18//
19// std::vector data = ReadFile("my_tokenized_strings.db");
20// Detokenizer detok(TokenDatabase::Create(data));
21//
22// DetokenizedString result = detok.Detokenize(my_data);
23// std::cout << result.BestString() << '\n';
24//
25#pragma once
26
27#include <cstddef>
28#include <cstdint>
29#include <string>
30#include <unordered_map>
31#include <utility>
32#include <vector>
33
34#include "pw_result/result.h"
35#include "pw_span/span.h"
36#include "pw_stream/stream.h"
37#include "pw_tokenizer/internal/decode.h"
38#include "pw_tokenizer/token_database.h"
39#include "pw_tokenizer/tokenize.h"
40
41namespace pw::tokenizer {
42
45
46class Detokenizer;
47
// Token database entry: a format string paired with a uint32_t that the inline
// comment labels as the entry's removal date.
49using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;
// Two-level table of entries. The outer key is a std::string (presumably the
// domain name, matching DatabaseLookup's `domain` argument -- confirm) and the
// inner key is a uint32_t (presumably the token -- confirm). Each inner value
// is the list of entries stored under that key.
50using DomainTokenEntriesMap = std::unordered_map<
51 std::string,
52 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>>;
53
57 public:
// Constructor taking the detokenizer, a recursion flag, the token, a span of
// candidate database entries, and a span of argument bytes. Declared only;
// the definition is not part of this header.
// NOTE(review): `recursion` presumably enables nested detokenization, as in
// Detokenizer::Detokenize(encoded, domain, recursion) -- confirm in the
// definition.
58 DetokenizedString(const Detokenizer& detokenizer,
59 bool recursion,
60 uint32_t token,
61 const span<const TokenizedStringEntry>& entries,
62 const span<const std::byte>& arguments);
63
// Default constructor: marks the result as having no token. best_string_ and
// matches_ are default-constructed (empty), so ok() returns false.
// NOTE(review): token_ is not in the initializer list and has no default
// member initializer, so token() reads an indeterminate value on a
// default-constructed object -- consider initializing it to 0.
64 DetokenizedString() : has_token_(false) {}
65
// True if there was only one match that decoded successfully.
67 bool ok() const {
68 bool successful_decode = false;
69 for (const auto& match : matches_) {
70 if (match.ok()) {
71 if (successful_decode) {
// A second successful decode makes the result ambiguous, so stop early.
72 return false;
73 }
74 successful_decode = true;
75 }
76 }
77
// Still false here when matches_ is empty or no match decoded successfully.
78 return successful_decode;
79 }
80
// Returns the strings that matched the token, with the best matches first.
// The vector is returned by const reference to the matches_ member.
82 const std::vector<DecodedFormatString>& matches() const { return matches_; }
83
// Returns the token_ member by const reference.
84 const uint32_t& token() const { return token_; }
85
// Returns the best_string_ member by const reference; the reference refers to
// storage owned by this object.
89 const std::string& BestString() const { return best_string_; }
90
// Returns a std::string by value. Declared only; defined out of line.
// NOTE(review): presumably BestString() with decode errors included in the
// text -- confirm against the definition.
93 std::string BestStringWithErrors() const;
94
95 private:
// Value returned by token(); not set by the default constructor.
96 uint32_t token_;
// String returned by BestString().
97 std::string best_string_;
// Set to false by the default constructor.
98 bool has_token_;
// Decoded candidates; iterated by ok() and exposed through matches().
99 std::vector<DecodedFormatString> matches_;
100};
101
105 public:
// Constructs a detokenizer from a TokenDatabase. `explicit` prevents implicit
// conversion from TokenDatabase. Declared only; defined out of line.
109 explicit Detokenizer(const TokenDatabase& database);
110
// Constructs a detokenizer by directly passing the parsed database. The map is
// moved into database_, so the caller's argument is left in a moved-from state.
112 explicit Detokenizer(DomainTokenEntriesMap&& database)
113 : database_(std::move(database)) {}
114
// Factory returning Result<Detokenizer> from the bytes of an ELF section.
// Declared only; defined out of line.
// NOTE(review): presumably the section holds the token database entries and a
// non-OK Result is returned when parsing fails -- confirm in the definition.
117 static Result<Detokenizer> FromElfSection(span<const std::byte> elf_section);
118
// Overload of FromElfSection for a uint8_t span. Views the same memory as
// std::byte via as_bytes() and forwards to the std::byte overload.
120 static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section) {
121 return FromElfSection(as_bytes(elf_section));
122 }
123
// Factory returning Result<Detokenizer> from a seekable stream. Declared only;
// defined out of line.
// NOTE(review): presumably the stream yields an ELF file whose token section
// is read -- confirm in the definition.
126 static Result<Detokenizer> FromElfFile(stream::SeekableReader& stream);
127
// Constructs a detokenizer from a CSV database passed as text. Declared only;
// defined out of line.
129 static Result<Detokenizer> FromCsv(std::string_view csv);
130
// Detokenizes `encoded` using entries from `domain` (kDefaultDomain when
// omitted). Forwards to the private three-argument Detokenize with
// recursion = false.
133 DetokenizedString Detokenize(const span<const std::byte>& encoded,
134 std::string_view domain = kDefaultDomain) const {
135 return Detokenize(encoded, domain, false);
136 }
137
// Overload of Detokenize for span<const uint8_t>. Views the data as std::byte
// via as_bytes() and forwards to the std::byte span overload.
139 DetokenizedString Detokenize(const span<const uint8_t>& encoded,
140 std::string_view domain = kDefaultDomain) const {
141 return Detokenize(as_bytes(encoded), domain);
142 }
143
// Overload of Detokenize for std::string_view. Passes the view's data pointer
// and size to the pointer-and-length overload.
145 DetokenizedString Detokenize(std::string_view encoded,
146 std::string_view domain = kDefaultDomain) const {
147 return Detokenize(encoded.data(), encoded.size(), domain);
148 }
149
// Overload of Detokenize for a pointer and length. Wraps the memory in a span
// of const std::byte covering `size_bytes` bytes and forwards to the span
// overload.
151 DetokenizedString Detokenize(const void* encoded,
152 size_t size_bytes,
153 std::string_view domain = kDefaultDomain) const {
154 return Detokenize(span(static_cast<const std::byte*>(encoded), size_bytes),
155 domain);
156 }
157
161 const span<const std::byte>& encoded,
162 std::string_view domain = kDefaultDomain) const {
163 return Detokenize(encoded, domain, true);
164 }
165
168 const span<const uint8_t>& encoded,
169 std::string_view domain = kDefaultDomain) const {
170 return RecursiveDetokenize(as_bytes(encoded), domain);
171 }
172
175 std::string_view encoded,
176 std::string_view domain = kDefaultDomain) const {
177 return RecursiveDetokenize(encoded.data(), encoded.size(), domain);
178 }
179
182 const void* encoded,
183 size_t size_bytes,
184 std::string_view domain = kDefaultDomain) const {
185 return RecursiveDetokenize(
186 span(static_cast<const std::byte*>(encoded), size_bytes), domain);
187 }
188
// Takes text and returns a DetokenizedString. Declared only; defined out of
// line.
// NOTE(review): presumably `text` is a single Base64-encoded tokenized message
// -- confirm the expected prefix/format in the definition.
191 DetokenizedString DetokenizeBase64Message(std::string_view text) const;
192
// Returns the result of DetokenizeTextRecursive(text, kMaxDecodePasses), i.e.
// with the pass limit fixed at the class constant kMaxDecodePasses (4).
202 std::string DetokenizeText(std::string_view text) const {
203 return DetokenizeTextRecursive(text, kMaxDecodePasses);
204 }
205
220 const span<const std::byte>& optionally_tokenized_data);
221
// Read-only access to the database_ member, returned by const reference.
222 const DomainTokenEntriesMap& database() const { return database_; }
223
// Returns a span of entries for `token` within `domain`. Declared only;
// defined out of line.
// NOTE(review): span is non-owning; presumably it views storage inside
// database_ and is empty when nothing matches -- confirm in the definition.
224 span<const TokenizedStringEntry> DatabaseLookup(
225 uint32_t token, std::string_view domain) const;
226
227 private:
228 // Four passes support detokenizing two layers of nested messages with tokenized
229 // domains (e.g. ${${bar}#ab12cd34}#00000012), without allowing a hypothetical
230 // detokenization cycle to continue for too long.
// Passed by DetokenizeText() as the max_passes argument.
231 static constexpr unsigned kMaxDecodePasses = 4;
232
// Helper called by DetokenizeText() with max_passes = kMaxDecodePasses.
// Declared only; defined out of line.
// NOTE(review): presumably max_passes bounds how many times the text is
// re-scanned for nested tokens -- confirm in the definition.
233 std::string DetokenizeTextRecursive(std::string_view text,
234 unsigned max_passes) const;
235
// Private implementation behind the public overloads: the public
// Detokenize(span, domain) calls it with recursion = false. Unlike the public
// overloads, `domain` has no default here. Declared only; defined out of line.
238 DetokenizedString Detokenize(const span<const std::byte>& encoded,
239 std::string_view domain,
240 bool recursion) const;
241
// Owned token table; filled by the constructors and exposed via database().
242 DomainTokenEntriesMap database_;
243};
244
246
247} // namespace pw::tokenizer
Definition: stream.h:394
Definition: detokenize.h:56
const std::vector< DecodedFormatString > & matches() const
Returns the strings that matched the token, with the best matches first.
Definition: detokenize.h:82
bool ok() const
True if there was only one match that decoded successfully.
Definition: detokenize.h:67
std::string BestStringWithErrors() const
const std::string & BestString() const
Definition: detokenize.h:89
Definition: detokenize.h:104
DetokenizedString Detokenize(const span< const std::byte > &encoded, std::string_view domain=kDefaultDomain) const
Definition: detokenize.h:133
DetokenizedString Detokenize(const span< const uint8_t > &encoded, std::string_view domain=kDefaultDomain) const
Overload of Detokenize for span<const uint8_t>.
Definition: detokenize.h:139
std::string DetokenizeText(std::string_view text) const
Definition: detokenize.h:202
static Result< Detokenizer > FromElfFile(stream::SeekableReader &stream)
static Result< Detokenizer > FromCsv(std::string_view csv)
Constructs a detokenizer from a CSV database.
DetokenizedString RecursiveDetokenize(const span< const std::byte > &encoded, std::string_view domain=kDefaultDomain) const
Definition: detokenize.h:160
DetokenizedString DetokenizeBase64Message(std::string_view text) const
static Result< Detokenizer > FromElfSection(span< const uint8_t > elf_section)
Overload of FromElfSection for a uint8_t span.
Definition: detokenize.h:120
Detokenizer(const TokenDatabase &database)
DetokenizedString Detokenize(const void *encoded, size_t size_bytes, std::string_view domain=kDefaultDomain) const
Overload of Detokenize for a pointer and length.
Definition: detokenize.h:151
Detokenizer(DomainTokenEntriesMap &&database)
Constructs a detokenizer by directly passing the parsed database.
Definition: detokenize.h:112
DetokenizedString Detokenize(std::string_view encoded, std::string_view domain=kDefaultDomain) const
Overload of Detokenize for std::string_view.
Definition: detokenize.h:145
DetokenizedString RecursiveDetokenize(const span< const uint8_t > &encoded, std::string_view domain=kDefaultDomain) const
Overload of RecursiveDetokenize for span<const uint8_t>.
Definition: detokenize.h:167
static Result< Detokenizer > FromElfSection(span< const std::byte > elf_section)
DetokenizedString RecursiveDetokenize(const void *encoded, size_t size_bytes, std::string_view domain=kDefaultDomain) const
Overload of RecursiveDetokenize for a pointer and length.
Definition: detokenize.h:181
DetokenizedString RecursiveDetokenize(std::string_view encoded, std::string_view domain=kDefaultDomain) const
Overload of RecursiveDetokenize for std::string_view.
Definition: detokenize.h:174
std::string DecodeOptionallyTokenizedData(const span< const std::byte > &optionally_tokenized_data)
Definition: token_database.h:75
std::pair< FormatString, uint32_t > TokenizedStringEntry
Token database entry.
Definition: detokenize.h:49