C/C++ API Reference
Loading...
Searching...
No Matches
token_database.h
1// Copyright 2020 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14#pragma once
15
16#include <array>
17#include <cstddef>
18#include <cstdint>
19#include <iterator>
20
21namespace pw::tokenizer {
22
24
78 private:
// Describes how the underlying binary token database stores one entry: two
// consecutive 32-bit fields. This is an internal detail; code should normally
// work with Entry instead, which also carries a pointer to the entry's string.
struct RawEntry {
  uint32_t token;         // First 32-bit field of the stored entry.
  uint32_t date_removed;  // Second 32-bit field of the stored entry.
};

// The iterator steps through the database in sizeof(RawEntry) increments, so
// the struct must match the eight bytes each stored entry occupies.
static_assert(sizeof(RawEntry) == 8u, "RawEntry must be exactly 8 bytes");
88
// Decodes a little-endian 32-bit unsigned integer from the four bytes that
// start at `bytes`.
//
// Each byte is first converted to uint8_t so that a negative value of a signed
// T is not sign-extended, and is then widened to uint32_t *before* shifting.
// Shifting the uint8_t directly would perform the shift on a promoted `int`,
// which shifts into the sign bit for `<< 24` and overflows outright where
// `int` is 16 bits wide.
//
// @param bytes Pointer to at least four readable one-byte elements.
// @return bytes[0] | bytes[1] << 8 | bytes[2] << 16 | bytes[3] << 24.
template <typename T>
static constexpr uint32_t ReadUint32(const T* bytes) {
  return static_cast<uint32_t>(static_cast<uint8_t>(bytes[0])) |
         static_cast<uint32_t>(static_cast<uint8_t>(bytes[1])) << 8 |
         static_cast<uint32_t>(static_cast<uint8_t>(bytes[2])) << 16 |
         static_cast<uint32_t>(static_cast<uint8_t>(bytes[3])) << 24;
}
96
97 public:
/// Sentinel with all 32 bits set.
/// NOTE(review): going by the name, this is presumably the `date_removed`
/// value of an entry that has never been removed -- confirm.
static constexpr uint32_t kDateRemovedNever = 0xFFFFFFFFu;
101
/// An entry in the token database.
struct Entry {
  /// The token that represents this string.
  uint32_t token;

  /// The removal-date field decoded from the stored entry.
  /// NOTE(review): presumably compared against kDateRemovedNever -- confirm.
  uint32_t date_removed;

  /// The null-terminated string represented by this token.
  const char* string;
};
117
119 class iterator {
120 public:
121 using difference_type = std::ptrdiff_t;
122 using value_type = Entry;
123 using pointer = const Entry*;
124 using reference = const Entry&;
125 using iterator_category = std::forward_iterator_tag;
126
127 constexpr iterator() : entry_{}, raw_(nullptr) {}
128
129 constexpr iterator(const iterator& other) = default;
130 constexpr iterator& operator=(const iterator& other) = default;
131
132 constexpr iterator& operator++() {
133 raw_ += sizeof(RawEntry);
134 ReadRawEntry();
135 // Move string_ to the character beyond the next null terminator.
136 while (*entry_.string++ != '\0') {
137 }
138 return *this;
139 }
140 constexpr iterator operator++(int) {
141 iterator previous(*this);
142 operator++();
143 return previous;
144 }
145 constexpr bool operator==(const iterator& rhs) const {
146 return raw_ == rhs.raw_;
147 }
148 constexpr bool operator!=(const iterator& rhs) const {
149 return raw_ != rhs.raw_;
150 }
151
152 constexpr const Entry& operator*() const { return entry_; }
153
154 constexpr const Entry* operator->() const { return &entry_; }
155
156 constexpr difference_type operator-(const iterator& rhs) const {
157 return (raw_ - rhs.raw_) / static_cast<difference_type>(sizeof(RawEntry));
158 }
159
160 private:
161 friend class TokenDatabase;
162
163 // Constructs a new iterator to a valid entry.
164 constexpr iterator(const char* raw_entry, const char* string)
165 : entry_{0, 0, string}, raw_{raw_entry} {
166 if (raw_entry != string) { // raw_entry == string if the DB is empty
167 ReadRawEntry();
168 }
169 }
170
171 explicit constexpr iterator(const char* end) : entry_{}, raw_(end) {}
172
173 constexpr void ReadRawEntry() {
174 entry_.token = ReadUint32(raw_);
175 entry_.date_removed = ReadUint32(raw_ + sizeof(entry_.token));
176 }
177
178 Entry entry_;
179 const char* raw_;
180 };
181
182 using value_type = Entry;
183 using size_type = std::size_t;
184 using difference_type = std::ptrdiff_t;
185 using reference = value_type&;
186 using const_reference = const value_type&;
187 using pointer = const value_type*;
188 using const_pointer = const value_type*;
189 using const_iterator = iterator;
190 using reverse_iterator = std::reverse_iterator<iterator>;
191 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
192
195 class Entries {
196 public:
197 constexpr Entries(const iterator& begin, const iterator& end)
198 : begin_(begin), end_(end) {}
199
200 // The number of entries in this list.
201 constexpr size_type size() const {
202 return static_cast<size_type>(end_ - begin_);
203 }
204
205 // True of the list is empty.
206 constexpr bool empty() const { return begin_ == end_; }
207
208 // Accesses the specified entry in this set. The index must be less than
209 // size(). This operation is O(n) in size().
210 Entry operator[](size_type index) const;
211
212 constexpr const iterator& begin() const { return begin_; }
213 constexpr const iterator& end() const { return end_; }
214
215 private:
216 iterator begin_;
217 iterator end_;
218 };
219
/// Returns true if `bytes` passes both database checks: it starts with a valid
/// header and its string table contains at least one string per entry.
///
/// The header is checked first; the entry check only runs when it succeeds.
template <typename ByteArray>
static constexpr bool IsValid(const ByteArray& bytes) {
  if (!HasValidHeader(bytes)) {
    return false;
  }
  return EachEntryHasAString(bytes);
}
228
239 template <const auto& kDatabaseBytes>
240 static constexpr TokenDatabase Create() {
241 static_assert(
242 HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes),
243 "Databases must start with a 16-byte header that begins with TOKENS.");
244
245 static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes),
246 "The database must have at least one string for each entry.");
247
248 return TokenDatabase(std::data(kDatabaseBytes));
249 }
250
258 template <typename ByteArray>
259 static constexpr TokenDatabase Create(const ByteArray& database_bytes) {
260 return IsValid<ByteArray>(database_bytes)
261 ? TokenDatabase(std::data(database_bytes))
262 : TokenDatabase(); // Invalid database.
263 }
265 constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {}
266
268 Entries Find(uint32_t token) const;
269
271 constexpr size_type size() const {
272 return static_cast<size_type>(end_.data - begin_.data) / sizeof(RawEntry);
273 }
274
277 constexpr bool ok() const { return begin_.data != nullptr; }
278
280 constexpr iterator begin() const { return iterator(begin_.data, end_.data); }
281
283 constexpr iterator end() const { return iterator(end_.data); }
284
285 private:
// Layout of the header at the start of a binary token database.
struct Header {
  std::array<char, 6> magic;  // Magic number; see kMagicAndVersion.
  uint16_t version;           // Two-byte version that follows the magic.
  uint32_t entry_count;       // Number of entries; see ReadEntryCount().
  uint32_t reserved;          // Not read anywhere in this class.
};
292
293 static_assert(sizeof(Header) == 2 * sizeof(RawEntry));
294
295 template <typename ByteArray>
296 static constexpr bool HasValidHeader(const ByteArray& bytes) {
297 static_assert(sizeof(*std::data(bytes)) == 1u);
298
299 if (std::size(bytes) < sizeof(Header)) {
300 return false;
301 }
302
303 // Check the magic number and version.
304 for (size_type i = 0; i < kMagicAndVersion.size(); ++i) {
305 if (bytes[i] != kMagicAndVersion[i]) {
306 return false;
307 }
308 }
309
310 return true;
311 }
312
313 template <typename ByteArray>
314 static constexpr bool EachEntryHasAString(const ByteArray& bytes) {
315 const size_type entries = ReadEntryCount(std::data(bytes));
316
317 // Check that the data is large enough to have a string table.
318 if (std::size(bytes) < StringTable(entries)) {
319 return false;
320 }
321
322 // Count the strings in the string table.
323 size_type string_count = 0;
324 for (auto i =
325 std::begin(bytes) + static_cast<ptrdiff_t>(StringTable(entries));
326 i < std::end(bytes);
327 ++i) {
328 string_count += (*i == '\0') ? 1 : 0;
329 }
330
331 // Check that there is at least one string for each entry.
332 return string_count >= entries;
333 }
334
335 // Reads the number of entries from a database header. Cast to the bytes to
336 // uint8_t to avoid sign extension if T is signed.
337 template <typename T>
338 static constexpr uint32_t ReadEntryCount(const T* header_bytes) {
339 const T* bytes = header_bytes + offsetof(Header, entry_count);
340 return ReadUint32(bytes);
341 }
342
343 // Calculates the offset of the string table.
344 static constexpr size_type StringTable(size_type entries) {
345 return sizeof(Header) + entries * sizeof(RawEntry);
346 }
347
// The first eight bytes expected at the start of a database: the magic number
// "TOKENS" followed by the version, which is encoded as two bytes.
static constexpr std::array<char, 8> kMagicAndVersion = {
    'T', 'O', 'K', 'E', 'N', 'S',  // Magic number.
    '\0', '\0'};                   // Version.
352
353 template <typename Byte>
354 constexpr TokenDatabase(const Byte bytes[])
355 : TokenDatabase(bytes + sizeof(Header),
356 bytes + StringTable(ReadEntryCount(bytes))) {
357 static_assert(sizeof(Byte) == 1u);
358 }
359
360 // It is illegal to reinterpret_cast in constexpr functions, but acceptable to
361 // use unions. Instead of using a reinterpret_cast to change the byte pointer
362 // to a RawEntry pointer, have a separate overload for each byte pointer type
363 // and store them in a union.
364 constexpr TokenDatabase(const char* begin, const char* end)
365 : begin_{.data = begin}, end_{.data = end} {}
366
367 constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end)
368 : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {}
369
370 constexpr TokenDatabase(const signed char* begin, const signed char* end)
371 : begin_{.signed_data = begin}, end_{.signed_data = end} {}
372
// The beginning and end pointers are stored as unions, with one member per
// supported byte pointer type, to avoid breaking the constexpr rules for
// reinterpret_cast. The constructors initialize the member that matches their
// argument type; the accessors of this class read the `data` member.
// NOTE(review): reading `data` after a different member was initialized
// relies on the three char pointer types sharing a representation -- confirm
// this is acceptable for the toolchains in use.
union {
  const char* data;
  const unsigned char* unsigned_data;
  const signed char* signed_data;
} begin_, end_;
380};
381
383
384} // namespace pw::tokenizer
Definition: token_database.h:195
Iterator for TokenDatabase values.
Definition: token_database.h:119
Definition: token_database.h:77
static constexpr uint32_t kDateRemovedNever
Definition: token_database.h:100
constexpr TokenDatabase()
Creates a database with no data. ok() returns false.
Definition: token_database.h:265
static constexpr TokenDatabase Create(const ByteArray &database_bytes)
Definition: token_database.h:259
constexpr iterator begin() const
Returns an iterator for the first token entry.
Definition: token_database.h:280
constexpr size_type size() const
Returns the total number of entries (unique token-string pairs).
Definition: token_database.h:271
static constexpr bool IsValid(const ByteArray &bytes)
Definition: token_database.h:225
constexpr bool ok() const
Definition: token_database.h:277
Entries Find(uint32_t token) const
Returns all entries associated with this token. This is O(n).
static constexpr TokenDatabase Create()
Definition: token_database.h:240
constexpr iterator end() const
Returns an iterator for one past the last token entry.
Definition: token_database.h:283
An entry in the token database.
Definition: token_database.h:103
const char * string
The null-terminated string represented by this token.
Definition: token_database.h:115
uint32_t date_removed
Definition: token_database.h:112
uint32_t token
The token that represents this string.
Definition: token_database.h:105