pw_tokenizer_core/pw_tokenizer_core.rs
// Copyright 2023 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

//! # pw_tokenizer_core
//!
//! This crate provides the core functionality for calculating a token hash
//! for a string or byte sequence. It is intended as a minimal core shared by
//! the main `pw_tokenizer` and `pw_tokenizer_macro` crates. Users should
//! prefer depending on `pw_tokenizer` instead of this crate.
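//!
//! Hashing a string to its token value:
//!
//! ```
//! use pw_tokenizer_core::hash_string;
//!
//! assert_eq!(hash_string("I 💖 Pigweed"), 0xe318d1b3);
//! ```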
#![cfg_attr(not(feature = "std"), no_std)]

/// Multiplicative coefficient used by the 65599 token hash.
pub const HASH_CONSTANT: u32 = 65599u32;

/// `Hasher` calculates a token's hash value.
///
/// The hash state is encapsulated in the `Hasher` to allow hashing of
/// multi-part strings where the concatenated value cannot be known at macro
/// expansion time. All functions are `const`.
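///
/// Hashing data in multiple chunks produces the same value as hashing it all
/// at once:
///
/// ```
/// use pw_tokenizer_core::{hash_bytes, Hasher};
///
/// let data = b"multi-part data";
/// let hasher = Hasher::new(data.len(), data.len());
/// let hash = hasher.process_bytes(&data[..5]).process_bytes(&data[5..]);
/// assert_eq!(hash.hash(), hash_bytes(data));
/// ```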
pub struct Hasher {
    // Coefficient applied to the next byte; multiplied by `HASH_CONSTANT`
    // after each byte is processed.
    coefficient: u32,
    // Accumulated hash value, seeded with the total data length.
    hash: u32,
    // Number of bytes processed so far.
    bytes_hashed: usize,
    // Maximum number of bytes to include in the hash.
    hash_len: usize,
}

impl Hasher {
    /// Creates a new `Hasher`.
    ///
    /// `data_len` is the total length of the data to be hashed and is used
    /// to seed the hash. `hash_len` is the maximum number of bytes used in
    /// calculating the hash; any bytes beyond it are ignored.
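    ///
    /// Seeding with the full data length while hashing only a four byte
    /// prefix matches [`hash_bytes_fixed`]:
    ///
    /// ```
    /// use pw_tokenizer_core::Hasher;
    ///
    /// let bytes = [0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07];
    /// let hasher = Hasher::new(bytes.len(), 4);
    /// assert_eq!(hasher.process_bytes(&bytes).hash(), 0x92c5d2ac);
    /// ```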
    pub const fn new(data_len: usize, hash_len: usize) -> Self {
        Self {
            coefficient: HASH_CONSTANT,
            hash: data_len as u32,
            bytes_hashed: 0,
            hash_len,
        }
    }

    /// Processes `bytes` and updates hash state.
    ///
    /// Consumes `self` and returns a [`Hasher`] with the updated state.
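    ///
    /// Bytes beyond the `hash_len` passed to [`Hasher::new`] are ignored:
    ///
    /// ```
    /// use pw_tokenizer_core::Hasher;
    ///
    /// let h1 = Hasher::new(3, 3).process_bytes(&[1, 2, 3]).hash();
    /// let h2 = Hasher::new(3, 3).process_bytes(&[1, 2, 3, 4]).hash();
    /// assert_eq!(h1, h2);
    /// ```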
    pub const fn process_bytes(mut self, bytes: &[u8]) -> Self {
        let bytes_left = self.hash_len - self.bytes_hashed;

        let bytes = if bytes.len() > bytes_left {
            bytes.split_at(bytes_left).0
        } else {
            bytes
        };

        // For loops are not allowed in const functions.
        let mut i = 0;
        while i < bytes.len() {
            // We call `u32::wrapping_*` instead of using `core::num::Wrapping` to
            // avoid using traits in a const function.
            self.hash = self
                .hash
                .wrapping_add(self.coefficient.wrapping_mul(bytes[i] as u32));
            self.coefficient = self.coefficient.wrapping_mul(HASH_CONSTANT);
            i += 1;
        }
        self.bytes_hashed += i;

        self
    }

    /// Consumes `self` and returns the hash.
    pub const fn hash(self) -> u32 {
        self.hash
    }
}

/// Calculates the hash for a sequence of bytes.
///
/// ```
/// use pw_tokenizer_core::hash_bytes;
///
/// let hash = hash_bytes(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07]);
/// assert_eq!(hash, 0x9e624642);
/// ```
pub const fn hash_bytes(bytes: &[u8]) -> u32 {
    hash_bytes_fixed(bytes, bytes.len())
}

/// Calculates the hash for a sequence of bytes, examining at most `len` bytes.
///
/// ```
/// use pw_tokenizer_core::hash_bytes_fixed;
///
/// let hash = hash_bytes_fixed(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07], 4);
/// assert_eq!(hash, 0x92c5d2ac);
/// ```
pub const fn hash_bytes_fixed(bytes: &[u8], len: usize) -> u32 {
    Hasher::new(bytes.len(), len).process_bytes(bytes).hash()
}

/// Calculates the hash for a string.
///
/// ```
/// use pw_tokenizer_core::hash_string;
///
/// let hash = hash_string("I 💖 Pigweed");
/// assert_eq!(hash, 0xe318d1b3);
/// ```
pub const fn hash_string(s: &str) -> u32 {
    hash_bytes(s.as_bytes())
}

/// Magic value used to mark tokenizer entries in the token database.
pub const TOKENIZER_ENTRY_MAGIC: u32 = 0xBAA98DEE;

#[cfg(test)]
mod tests {
    use super::{hash_bytes_fixed, Hasher};

    struct TestCase {
        string: &'static [u8],
        hash_length: usize,
        hash: u32,
    }

    // Generated file defines `test_cases()`.
    include!("pw_tokenizer_core_test_cases.rs");

    #[test]
    fn hash_bytes_fixed_generates_correct_hash() {
        for test in test_cases() {
            let hash = hash_bytes_fixed(test.string, test.hash_length);
            assert_eq!(
                hash, test.hash,
                "{:08x} != {:08x} string: {:x?} hash_size: {}",
                hash, test.hash, test.string, test.hash_length
            );
        }
    }

    #[test]
    fn multi_part_data_generates_correct_hash() {
        for test in test_cases() {
            let mut hasher = Hasher::new(test.string.len(), test.hash_length);
            // Break each test string into 8-byte chunks and feed them to the
            // hasher separately.
            for chunk in test.string.chunks(8) {
                hasher = hasher.process_bytes(chunk);
            }
            let hash = hasher.hash();
            assert_eq!(
                hash, test.hash,
                "{:08x} != {:08x} string: {:x?} hash_size: {}",
                hash, test.hash, test.string, test.hash_length
            );
        }
    }
}