Pigweed
 
Loading...
Searching...
No Matches
detokenize.h
1// Copyright 2020 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14
15// This file provides the Detokenizer class, which is used to decode tokenized
16// strings. To use a Detokenizer, load a binary format token database into
17// memory, construct a TokenDatabase, and pass it to a Detokenizer:
18//
19// std::vector data = ReadFile("my_tokenized_strings.db");
20// Detokenizer detok(TokenDatabase::Create(data));
21//
22// DetokenizedString result = detok.Detokenize(my_data);
23// std::cout << result.BestString() << '\n';
24//
25#pragma once
26
27#include <cstddef>
28#include <cstdint>
29#include <string>
30#include <unordered_map>
31#include <utility>
32#include <vector>
33
34#include "pw_result/result.h"
35#include "pw_span/span.h"
36#include "pw_stream/stream.h"
37#include "pw_tokenizer/internal/decode.h"
38#include "pw_tokenizer/token_database.h"
39
40namespace pw::tokenizer {
41
44
/// Token database entry: a FormatString paired with a uint32_t that the inline
/// note in the declaration labels as the entry's "date removed".
46using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;
/// Two-level lookup table: a std::string key maps to an inner table, which maps
/// a uint32_t key to the list of TokenizedStringEntry values stored under it.
/// NOTE(review): judging by the alias name, the outer key is presumably a
/// domain name and the inner key a token -- confirm.
47using DomainTokenEntriesMap = std::unordered_map<
48 std::string,
49 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>>;
50
54 public:
/// Builds a result for `token` from the candidate database `entries` and the
/// encoded `arguments` bytes. Declared here only; the definition is out of line.
/// NOTE(review): presumably decodes `arguments` against each entry to populate
/// matches() -- confirm against the definition.
55 DetokenizedString(uint32_t token,
56 const span<const TokenizedStringEntry>& entries,
57 const span<const std::byte>& arguments);
58
/// Default-constructs an empty result: `has_token_` is false and the match
/// list is empty. `token_` is explicitly set to 0 so that token() never hands
/// out a reference to an indeterminate value.
59 DetokenizedString() : token_(0), has_token_(false) {}
60
/// True if there was only one valid match and it decoded successfully, i.e.
/// the match list holds exactly one element and that element reports ok().
62 bool ok() const { return matches_.size() == 1 && matches_[0].ok(); }
63
/// Returns the strings that matched the token, with the best matches first.
/// The vector is returned by const reference; no copy is made.
65 const std::vector<DecodedFormatString>& matches() const { return matches_; }
66
/// Returns a const reference to the token value stored in this object.
67 const uint32_t& token() const { return token_; }
68
/// Returns a decoded string for this result, by value. Declared here only; the
/// definition is out of line.
/// NOTE(review): the name suggests the highest-ranked entry of matches() is
/// used -- confirm against the definition.
72 std::string BestString() const;
73
/// Returns a decoded string for this result, by value. Declared here only; the
/// definition is out of line.
/// NOTE(review): presumably like BestString() but with decoding error details
/// included in the text -- confirm against the definition.
76 std::string BestStringWithErrors() const;
77
78 private:
// Token value; exposed read-only through token().
79 uint32_t token_;
// Set to false by the default constructor.
// NOTE(review): presumably true when the object was built from a real token --
// confirm against the out-of-line constructor.
80 bool has_token_;
// Decoded candidates; exposed through matches() and inspected by ok().
81 std::vector<DecodedFormatString> matches_;
82};
83
87 public:
/// Constructs a detokenizer from a TokenDatabase. Declared here only; the
/// definition is out of line. The constructor is explicit, so a TokenDatabase
/// is never converted to a Detokenizer implicitly.
91 explicit Detokenizer(const TokenDatabase& database);
92
/// Constructs a detokenizer by directly passing the parsed database.
/// The map is taken by rvalue reference and moved into `database_`, so the
/// caller's map is left in a moved-from state. The parameter type is the same
/// nested map type that the DomainTokenEntriesMap alias names.
94 explicit Detokenizer(
95 std::unordered_map<
96 std::string,
97 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>>&&
98 database)
99 : database_(std::move(database)) {}
100
/// Creates a Detokenizer from the bytes of an ELF section and returns it
/// wrapped in a Result. Declared here only; the definition is out of line.
/// NOTE(review): which section is expected and which failures are reported
/// cannot be determined from this header -- confirm against the definition.
103 static Result<Detokenizer> FromElfSection(span<const std::byte> elf_section);
104
/// Overload of FromElfSection for a uint8_t span.
/// Views the same memory as bytes via as_bytes() and forwards to the
/// span<const std::byte> overload.
106 static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section) {
107 return FromElfSection(as_bytes(elf_section));
108 }
109
/// Creates a Detokenizer using data read through a stream::SeekableReader and
/// returns it wrapped in a Result. Declared here only; the definition is out
/// of line.
/// NOTE(review): presumably `stream` must provide a complete ELF file --
/// confirm against the definition.
112 static Result<Detokenizer> FromElfFile(stream::SeekableReader& stream);
113
/// Constructs a detokenizer from a parsed CSV database.
/// The CSV text is passed as a std::string_view and the detokenizer is
/// returned wrapped in a Result. Declared here only.
115 static Result<Detokenizer> FromCsv(std::string_view csv);
116
/// Decodes the `encoded` bytes and returns the outcome as a DetokenizedString.
/// Declared here only; the definition is out of line. The uint8_t-span,
/// std::string_view and pointer-plus-length overloads in this class all end up
/// forwarding to this overload.
119 DetokenizedString Detokenize(const span<const std::byte>& encoded) const;
120
/// Overload of Detokenize for span<const uint8_t>.
/// Views the same memory as bytes via as_bytes() and forwards the result.
122 DetokenizedString Detokenize(const span<const uint8_t>& encoded) const {
123 return Detokenize(as_bytes(encoded));
124 }
125
/// Overload of Detokenize for std::string_view.
/// Forwards the view's data() pointer and size() to the pointer-and-length
/// overload.
127 DetokenizedString Detokenize(std::string_view encoded) const {
128 return Detokenize(encoded.data(), encoded.size());
129 }
130
/// Overload of Detokenize for a pointer and length.
/// Casts `encoded` to `const std::byte*`, wraps it in a span of `size_bytes`
/// elements, and forwards that span.
132 DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const {
133 return Detokenize(span(static_cast<const std::byte*>(encoded), size_bytes));
134 }
135
/// Decodes `text` and returns the outcome as a DetokenizedString. Declared
/// here only; the definition is out of line.
/// NOTE(review): presumably `text` is a single Base64-encoded tokenized
/// message -- confirm against the definition.
138 DetokenizedString DetokenizeBase64Message(std::string_view text) const;
139
/// Produces a std::string from `text`. `max_passes` defaults to 3. Declared
/// here only; the definition is out of line.
/// NOTE(review): presumably replaces tokenized messages embedded in `text`
/// with their decoded form, repeating up to `max_passes` times so nested
/// messages are expanded -- confirm against the definition.
153 std::string DetokenizeText(std::string_view text,
154 unsigned max_passes = 3) const;
155
/// Deprecated: use DetokenizeText() instead.
/// Equivalent to DetokenizeText(text, 1), i.e. it passes a max_passes of 1
/// rather than DetokenizeText()'s default of 3.
158 [[deprecated("Use DetokenizeText() instead")]] std::string DetokenizeBase64(
159 std::string_view text) const {
160 return DetokenizeText(text, 1);
161 }
162
177 const span<const std::byte>& optionally_tokenized_data);
178
/// Returns a const reference to the token database held by this detokenizer.
179 const DomainTokenEntriesMap& database() const { return database_; }
180
181 private:
// Token database owned by this detokenizer; the map-taking constructor moves
// its argument into it, and database() exposes it read-only.
182 DomainTokenEntriesMap database_;
183};
184
186
187} // namespace pw::tokenizer
Definition: stream.h:394
Definition: detokenize.h:53
const std::vector< DecodedFormatString > & matches() const
Returns the strings that matched the token, with the best matches first.
Definition: detokenize.h:65
std::string BestString() const
bool ok() const
True if there was only one valid match and it decoded successfully.
Definition: detokenize.h:62
std::string BestStringWithErrors() const
Definition: detokenize.h:86
DetokenizedString Detokenize(const span< const uint8_t > &encoded) const
Overload of Detokenize for span<const uint8_t>.
Definition: detokenize.h:122
DetokenizedString Detokenize(std::string_view encoded) const
Overload of Detokenize for std::string_view.
Definition: detokenize.h:127
static Result< Detokenizer > FromElfFile(stream::SeekableReader &stream)
static Result< Detokenizer > FromCsv(std::string_view csv)
Constructs a detokenizer from a parsed CSV database.
std::string DetokenizeBase64(std::string_view text) const
Definition: detokenize.h:158
DetokenizedString DetokenizeBase64Message(std::string_view text) const
static Result< Detokenizer > FromElfSection(span< const uint8_t > elf_section)
Overload of FromElfSection for a uint8_t span.
Definition: detokenize.h:106
Detokenizer(const TokenDatabase &database)
Detokenizer(std::unordered_map< std::string, std::unordered_map< uint32_t, std::vector< TokenizedStringEntry > > > &&database)
Constructs a detokenizer by directly passing the parsed database.
Definition: detokenize.h:94
std::string DetokenizeText(std::string_view text, unsigned max_passes=3) const
DetokenizedString Detokenize(const void *encoded, size_t size_bytes) const
Overload of Detokenize for a pointer and length.
Definition: detokenize.h:132
static Result< Detokenizer > FromElfSection(span< const std::byte > elf_section)
DetokenizedString Detokenize(const span< const std::byte > &encoded) const
std::string DecodeOptionallyTokenizedData(const span< const std::byte > &optionally_tokenized_data)
Definition: token_database.h:75
std::pair< FormatString, uint32_t > TokenizedStringEntry
Token database entry.
Definition: detokenize.h:46