Skip to main content

pw_tokenizer_core/
pw_tokenizer_core.rs

1// Copyright 2023 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7//     https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14
15//! # pw_tokenizer_core
16//!
17//! This crate provides the core functionality of calculating a token hash
18//! for a string or byte sequence.  This is intended to provide a minimal core
19//! for both the main `pw_tokenizer` and `pw_tokenizer_macro` crates.  Users
20//! should prefer depending `pw_tokenizer` instead of this crate.
21#![cfg_attr(not(feature = "std"), no_std)]
22
23pub const HASH_CONSTANT: u32 = 65599u32;
24
25/// Hasher is used to calculate a token's hash value.
26///
27/// The hash state is encapsulated in the `Hasher` to allow the hashing of
28/// multi-part strings where the concatenated value can not be know at macro
29/// expansion time.  All functions are `const`.
30pub struct Hasher {
31    coefficient: u32,
32    hash: u32,
33    bytes_hashed: usize,
34    hash_len: usize,
35}
36
37impl Hasher {
38    /// Create a new `Hasher`
39    ///
40    /// `data_len` is the total length of the data to be hashed.  `hash_len`
41    /// is the number of bytes of data to be used in calculating the hash.
42    /// `data_len` is used to seed  the hash while `hash_len` controls how many
43    /// bytes are hashed.
44    #[must_use]
45    // TODO: https://pwbug.dev/524779003 - How does the C++ version hash a 4GB token on a 64bit target?
46    #[allow(clippy::cast_possible_truncation)]
47    pub const fn new(data_len: usize, hash_len: usize) -> Self {
48        {
49            Self {
50                coefficient: HASH_CONSTANT,
51                hash: data_len as u32,
52                bytes_hashed: 0,
53                hash_len,
54            }
55        }
56    }
57
58    /// Processes `bytes` and updates hash state.
59    ///
60    /// Consumes `self` and returns a [`Hasher`] with the updated state.
61    #[must_use]
62    pub const fn process_bytes(mut self, bytes: &[u8]) -> Self {
63        let bytes_left = self.hash_len - self.bytes_hashed;
64
65        let bytes = if bytes.len() > bytes_left {
66            bytes.split_at(bytes_left).0
67        } else {
68            bytes
69        };
70
71        // For loops are not allowed in const functions.
72        let mut i = 0;
73        while i < bytes.len() {
74            // We call `u32::wrapping_*` instead of using `core::num::Wrapping` to
75            // avoid using traits in a const function.
76            self.hash = self
77                .hash
78                .wrapping_add(self.coefficient.wrapping_mul(bytes[i] as u32));
79            self.coefficient = self.coefficient.wrapping_mul(HASH_CONSTANT);
80            i += 1;
81        }
82        self.bytes_hashed += i;
83
84        self
85    }
86
87    /// Consume `self` and return the hash.
88    #[must_use]
89    pub const fn hash(self) -> u32 {
90        self.hash
91    }
92}
93/// Calculate the hash for a sequence of bytes.
94///
95/// ```
96/// use pw_tokenizer_core::hash_bytes;
97///
98/// let hash = hash_bytes(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07]);
99/// assert_eq!(hash, 0x9e624642);
100/// ```
101#[must_use]
102pub const fn hash_bytes(bytes: &[u8]) -> u32 {
103    hash_bytes_fixed(bytes, bytes.len())
104}
105
106/// Calculate the hash for a sequence of bytes, examining at most `len` bytes.
107///
108/// ```
109/// use pw_tokenizer_core::hash_bytes_fixed;
110///
111/// let hash = hash_bytes_fixed(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07], 4);
112/// assert_eq!(hash, 0x92c5d2ac);
113/// ```
114#[must_use]
115pub const fn hash_bytes_fixed(bytes: &[u8], len: usize) -> u32 {
116    Hasher::new(bytes.len(), len).process_bytes(bytes).hash()
117}
118
119/// Calculate the hash for a string.
120///
121/// ```
122/// use pw_tokenizer_core::hash_string;
123///
124/// let hash = hash_string("I 💖 Pigweed");
125/// assert_eq!(hash, 0xe318d1b3);
126/// ```
127#[must_use]
128pub const fn hash_string(s: &str) -> u32 {
129    hash_bytes(s.as_bytes())
130}
131
132pub const TOKENIZER_ENTRY_MAGIC: u32 = 0xbaa98dee;
133
134#[cfg(test)]
135mod tests {
136    use super::{hash_bytes_fixed, Hasher};
137
138    struct TestCase {
139        string: &'static [u8],
140        hash_length: usize,
141        hash: u32,
142    }
143
144    // Generated file defines `test_cases()`.
145    include!("pw_tokenizer_core_test_cases.rs");
146
147    #[test]
148    fn hash_bytes_fixed_generates_correct_hash() {
149        for test in test_cases() {
150            let hash = hash_bytes_fixed(test.string, test.hash_length);
151            assert_eq!(
152                hash, test.hash,
153                "{:08x} != {:08x} string: {:x?} hash_size: {}",
154                hash, test.hash, test.string, test.hash_length
155            );
156        }
157    }
158
159    #[test]
160    fn multi_part_data_generates_correct_hash() {
161        for test in test_cases() {
162            let mut hasher = Hasher::new(test.string.len(), test.hash_length);
163            // Break each test string into 8 byte chunks and feed them to the
164            // hasher separately.
165            for chunk in test.string.chunks(8) {
166                hasher = hasher.process_bytes(chunk);
167            }
168            let hash = hasher.hash();
169            assert_eq!(
170                hash, test.hash,
171                "{:08x} != {:08x} string: {:x?} hash_size: {}",
172                hash, test.hash, test.string, test.hash_length
173            );
174        }
175    }
176}