Pigweed
 
Loading...
Searching...
No Matches
token_database.h
1// Copyright 2020 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14#pragma once
15
16#include <array>
17#include <cstddef>
18#include <cstdint>
19#include <iterator>
20
21namespace pw::tokenizer {
22
76 private:
77 // Internal struct that describes how the underlying binary token database
78 // stores entries. RawEntries generally should not be used directly. Instead,
79 // use an Entry, which contains a pointer to the entry's string.
80 struct RawEntry {
81 uint32_t token;
82 uint32_t date_removed;
83 };
84
85 static_assert(sizeof(RawEntry) == 8u);
86
87 template <typename T>
88 static constexpr uint32_t ReadUint32(const T* bytes) {
89 return static_cast<uint8_t>(bytes[0]) |
90 static_cast<uint8_t>(bytes[1]) << 8 |
91 static_cast<uint8_t>(bytes[2]) << 16 |
92 static_cast<uint8_t>(bytes[3]) << 24;
93 }
94
95 public:
98 static constexpr uint32_t kDateRemovedNever = 0xFFFFFFFF;
99
101 struct Entry {
103 uint32_t token;
104
110 uint32_t date_removed;
111
113 const char* string;
114 };
115
117 class iterator {
118 public:
119 using difference_type = std::ptrdiff_t;
120 using value_type = Entry;
121 using pointer = const Entry*;
122 using reference = const Entry&;
123 using iterator_category = std::forward_iterator_tag;
124
125 constexpr iterator() : entry_{}, raw_(nullptr) {}
126
127 constexpr iterator(const iterator& other) = default;
128 constexpr iterator& operator=(const iterator& other) = default;
129
130 constexpr iterator& operator++() {
131 raw_ += sizeof(RawEntry);
132 ReadRawEntry();
133 // Move string_ to the character beyond the next null terminator.
134 while (*entry_.string++ != '\0') {
135 }
136 return *this;
137 }
138 constexpr iterator operator++(int) {
139 iterator previous(*this);
140 operator++();
141 return previous;
142 }
143 constexpr bool operator==(const iterator& rhs) const {
144 return raw_ == rhs.raw_;
145 }
146 constexpr bool operator!=(const iterator& rhs) const {
147 return raw_ != rhs.raw_;
148 }
149
150 constexpr const Entry& operator*() const { return entry_; }
151
152 constexpr const Entry* operator->() const { return &entry_; }
153
154 constexpr difference_type operator-(const iterator& rhs) const {
155 return (raw_ - rhs.raw_) / sizeof(RawEntry);
156 }
157
158 private:
159 friend class TokenDatabase;
160
161 // Constructs a new iterator to a valid entry.
162 constexpr iterator(const char* raw_entry, const char* string)
163 : entry_{0, 0, string}, raw_{raw_entry} {
164 if (raw_entry != string) { // raw_entry == string if the DB is empty
165 ReadRawEntry();
166 }
167 }
168
169 explicit constexpr iterator(const char* end) : entry_{}, raw_(end) {}
170
171 constexpr void ReadRawEntry() {
172 entry_.token = ReadUint32(raw_);
173 entry_.date_removed = ReadUint32(raw_ + sizeof(entry_.token));
174 }
175
176 Entry entry_;
177 const char* raw_;
178 };
179
180 using value_type = Entry;
181 using size_type = std::size_t;
182 using difference_type = std::ptrdiff_t;
183 using reference = value_type&;
184 using const_reference = const value_type&;
185 using pointer = const value_type*;
186 using const_pointer = const value_type*;
187 using const_iterator = iterator;
188 using reverse_iterator = std::reverse_iterator<iterator>;
189 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
190
193 class Entries {
194 public:
195 constexpr Entries(const iterator& begin, const iterator& end)
196 : begin_(begin), end_(end) {}
197
198 // The number of entries in this list.
199 constexpr size_type size() const { return end_ - begin_; }
200
201 // True of the list is empty.
202 constexpr bool empty() const { return begin_ == end_; }
203
204 // Accesses the specified entry in this set. The index must be less than
205 // size(). This operation is O(n) in size().
206 Entry operator[](size_type index) const;
207
208 constexpr const iterator& begin() const { return begin_; }
209 constexpr const iterator& end() const { return end_; }
210
211 private:
212 iterator begin_;
213 iterator end_;
214 };
215
220 template <typename ByteArray>
221 static constexpr bool IsValid(const ByteArray& bytes) {
222 return HasValidHeader(bytes) && EachEntryHasAString(bytes);
223 }
224
235 template <const auto& kDatabaseBytes>
236 static constexpr TokenDatabase Create() {
237 static_assert(
238 HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes),
239 "Databases must start with a 16-byte header that begins with TOKENS.");
240
241 static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes),
242 "The database must have at least one string for each entry.");
243
244 return TokenDatabase(std::data(kDatabaseBytes));
245 }
246
254 template <typename ByteArray>
255 static constexpr TokenDatabase Create(const ByteArray& database_bytes) {
256 return IsValid<ByteArray>(database_bytes)
257 ? TokenDatabase(std::data(database_bytes))
258 : TokenDatabase(); // Invalid database.
259 }
261 constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {}
262
264 Entries Find(uint32_t token) const;
265
267 constexpr size_type size() const {
268 return (end_.data - begin_.data) / sizeof(RawEntry);
269 }
270
273 constexpr bool ok() const { return begin_.data != nullptr; }
274
276 constexpr iterator begin() const { return iterator(begin_.data, end_.data); }
277
279 constexpr iterator end() const { return iterator(end_.data); }
280
281 private:
282 struct Header {
283 std::array<char, 6> magic;
284 uint16_t version;
285 uint32_t entry_count;
286 uint32_t reserved;
287 };
288
289 static_assert(sizeof(Header) == 2 * sizeof(RawEntry));
290
291 template <typename ByteArray>
292 static constexpr bool HasValidHeader(const ByteArray& bytes) {
293 static_assert(sizeof(*std::data(bytes)) == 1u);
294
295 if (std::size(bytes) < sizeof(Header)) {
296 return false;
297 }
298
299 // Check the magic number and version.
300 for (size_type i = 0; i < kMagicAndVersion.size(); ++i) {
301 if (bytes[i] != kMagicAndVersion[i]) {
302 return false;
303 }
304 }
305
306 return true;
307 }
308
309 template <typename ByteArray>
310 static constexpr bool EachEntryHasAString(const ByteArray& bytes) {
311 const size_type entries = ReadEntryCount(std::data(bytes));
312
313 // Check that the data is large enough to have a string table.
314 if (std::size(bytes) < StringTable(entries)) {
315 return false;
316 }
317
318 // Count the strings in the string table.
319 size_type string_count = 0;
320 for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes);
321 ++i) {
322 string_count += (*i == '\0') ? 1 : 0;
323 }
324
325 // Check that there is at least one string for each entry.
326 return string_count >= entries;
327 }
328
329 // Reads the number of entries from a database header. Cast to the bytes to
330 // uint8_t to avoid sign extension if T is signed.
331 template <typename T>
332 static constexpr uint32_t ReadEntryCount(const T* header_bytes) {
333 const T* bytes = header_bytes + offsetof(Header, entry_count);
334 return ReadUint32(bytes);
335 }
336
337 // Calculates the offset of the string table.
338 static constexpr size_type StringTable(size_type entries) {
339 return sizeof(Header) + entries * sizeof(RawEntry);
340 }
341
342 // The magic number that starts the table is "TOKENS". The version is encoded
343 // next as two bytes.
344 static constexpr std::array<char, 8> kMagicAndVersion = {
345 'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'};
346
347 template <typename Byte>
348 constexpr TokenDatabase(const Byte bytes[])
349 : TokenDatabase(bytes + sizeof(Header),
350 bytes + StringTable(ReadEntryCount(bytes))) {
351 static_assert(sizeof(Byte) == 1u);
352 }
353
354 // It is illegal to reinterpret_cast in constexpr functions, but acceptable to
355 // use unions. Instead of using a reinterpret_cast to change the byte pointer
356 // to a RawEntry pointer, have a separate overload for each byte pointer type
357 // and store them in a union.
358 constexpr TokenDatabase(const char* begin, const char* end)
359 : begin_{.data = begin}, end_{.data = end} {}
360
361 constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end)
362 : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {}
363
364 constexpr TokenDatabase(const signed char* begin, const signed char* end)
365 : begin_{.signed_data = begin}, end_{.signed_data = end} {}
366
367 // Store the beginning and end pointers as a union to avoid breaking constexpr
368 // rules for reinterpret_cast.
369 union {
370 const char* data;
371 const unsigned char* unsigned_data;
372 const signed char* signed_data;
373 } begin_, end_;
374};
375
376} // namespace pw::tokenizer
Definition: token_database.h:193
Iterator for TokenDatabase values.
Definition: token_database.h:117
Definition: token_database.h:75
static constexpr uint32_t kDateRemovedNever
Definition: token_database.h:98
constexpr TokenDatabase()
Creates a database with no data. ok() returns false.
Definition: token_database.h:261
static constexpr TokenDatabase Create(const ByteArray &database_bytes)
Definition: token_database.h:255
constexpr iterator begin() const
Returns an iterator for the first token entry.
Definition: token_database.h:276
constexpr size_type size() const
Returns the total number of entries (unique token-string pairs).
Definition: token_database.h:267
static constexpr bool IsValid(const ByteArray &bytes)
Definition: token_database.h:221
constexpr bool ok() const
Definition: token_database.h:273
Entries Find(uint32_t token) const
Returns all entries associated with this token. This is O(n).
static constexpr TokenDatabase Create()
Definition: token_database.h:236
constexpr iterator end() const
Returns an iterator for one past the last token entry.
Definition: token_database.h:279
An entry in the token database.
Definition: token_database.h:101
const char * string
The null-terminated string represented by this token.
Definition: token_database.h:113
uint32_t date_removed
Definition: token_database.h:110
uint32_t token
The token that represents this string.
Definition: token_database.h:103