Pigweed
 
Loading...
Searching...
No Matches
utf_codecs.h
1// Copyright 2024 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14
15#pragma once
16
18
19#include <array>
20#include <cstdint>
21#include <string_view>
22
23#include "pw_result/result.h"
24#include "pw_status/status.h"
26
27namespace pw {
28namespace utf {
/// Checks whether `code_point` is a value that may be encoded as UTF-8.
///
/// Accepted values are U+0000..U+D7FF and U+E000..U+10FFFF. The surrogate
/// range U+D800..U+DFFF and everything above U+10FFFF are rejected.
///
/// @param code_point The value to check.
/// @returns `true` if `code_point` lies in one of the accepted ranges.
constexpr inline bool IsValidCodepoint(uint32_t code_point) {
  return code_point < 0xD800u ||
         (code_point >= 0xE000u && code_point <= 0x10FFFFu);
}
38
/// Checks whether `code_point` is an encodable value that is also not a
/// noncharacter.
///
/// On top of excluding surrogates (U+D800..U+DFFF) and values above U+10FFFF,
/// this rejects U+FDD0..U+FDEF and every value whose low 16 bits are 0xFFFE
/// or 0xFFFF.
///
/// @param code_point The value to check.
/// @returns `true` if `code_point` passes all of the checks above.
constexpr inline bool IsValidCharacter(uint32_t code_point) {
  // Everything below the surrogate range is accepted outright.
  if (code_point < 0xD800u) {
    return true;
  }
  // Surrogates, or beyond the last code point.
  if (code_point < 0xE000u || code_point > 0x10FFFFu) {
    return false;
  }
  // The contiguous block U+FDD0..U+FDEF.
  if (code_point >= 0xFDD0u && code_point <= 0xFDEFu) {
    return false;
  }
  // The last two values of every 64K plane (xxFFFE and xxFFFF).
  return (code_point & 0xFFFEu) != 0xFFFEu;
}
49
/// Packs a code point together with its encoded size into a single 32-bit
/// word.
///
/// The top `kSizeBits` bits hold the size and the remaining low bits hold the
/// code point.
class CodePointAndSize final {
 public:
  /// Creates a combined view of a `code_point` and its encoded `size`.
  ///
  /// `code_point` is masked to the bits reserved for it so that an oversized
  /// value cannot spill into (and corrupt) the stored size. Only the low
  /// `kSizeBits` bits of `size` are retained.
  explicit constexpr CodePointAndSize(uint32_t code_point, size_t size)
      : code_point_((static_cast<uint32_t>(size) << kSizeShift) |
                    (code_point & kCodePointMask)) {}

  constexpr CodePointAndSize(const CodePointAndSize&) = default;
  constexpr CodePointAndSize& operator=(const CodePointAndSize&) = default;
  constexpr CodePointAndSize(CodePointAndSize&&) = default;
  constexpr CodePointAndSize& operator=(CodePointAndSize&&) = default;

  /// Returns the code point this represents.
  constexpr uint32_t code_point() const { return code_point_ & kCodePointMask; }

  /// Returns the number of bytes required to encode this codepoint.
  constexpr size_t size() const {
    return (code_point_ & kSizeMask) >> kSizeShift;
  }

 private:
  // Number of high bits reserved for the size.
  static constexpr size_t kSizeBits = 4;
  // Position of the size field within the packed word.
  static constexpr size_t kSizeShift = sizeof(uint32_t) * 8 - kSizeBits;
  // Built from a uint32_t so the mask is right regardless of the width of
  // `unsigned int` on the target.
  static constexpr uint32_t kCodePointMask =
      static_cast<uint32_t>((uint32_t{1} << kSizeShift) - 1u);
  static constexpr uint32_t kSizeMask = static_cast<uint32_t>(~kCodePointMask);
  // Packed storage: size in the high bits, code point in the low bits.
  uint32_t code_point_;
};
83} // namespace utf
84
85namespace utf8 {
104constexpr pw::Result<utf::CodePointAndSize> ReadCodePoint(
105 std::string_view str) {
106 if (str.empty()) {
107 return pw::Status::InvalidArgument();
108 }
109
110 const uint8_t leading_byte = static_cast<uint8_t>(str.front());
111 size_t byte_count = 0;
112 uint32_t code_point = 0xFFFFFFFFu;
113
114 if (leading_byte <= 0x7F) {
115 byte_count = 1;
116 // b0xxx xxxx
117 code_point = leading_byte;
118 } else if (leading_byte <= 0xDF) {
119 byte_count = 2;
120 if (str.size() < byte_count) {
121 return pw::Status::InvalidArgument();
122 }
123 // b110x xxxx 10xx xxxx
124 if ((str[1] & 0xC0) != 0x80) {
125 // Invalid continuation
126 return pw::Status::InvalidArgument();
127 }
128 code_point = (static_cast<uint32_t>(str[0] & 0x1F) << 6) +
129 static_cast<uint32_t>(str[1] & 0x3F);
130 } else if (leading_byte <= 0xEF) {
131 byte_count = 3;
132 if (str.size() < byte_count) {
133 return pw::Status::InvalidArgument();
134 }
135 if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80) {
136 // Invalid continuation
137 return pw::Status::InvalidArgument();
138 }
139 // b1110 xxxx 10xx xxxx 10xx xxxx
140 code_point = (static_cast<uint32_t>(str[0] & 0x0F) << 12) +
141 (static_cast<uint32_t>(str[1] & 0x3F) << 6) +
142 static_cast<uint32_t>(str[2] & 0x3F);
143 } else if (leading_byte <= 0xF7) {
144 byte_count = 4;
145 if (str.size() < byte_count) {
146 return pw::Status::InvalidArgument();
147 }
148 if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80 ||
149 (str[3] & 0xC0) != 0x80) {
150 // Invalid continuation
151 return pw::Status::InvalidArgument();
152 }
153 // b1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
154 code_point = (static_cast<uint32_t>(str[0] & 0x07) << 18) +
155 (static_cast<uint32_t>(str[1] & 0x3F) << 12) +
156 (static_cast<uint32_t>(str[2] & 0x3F) << 6) +
157 static_cast<uint32_t>(str[3] & 0x3F);
158 } else {
159 return pw::Status::InvalidArgument();
160 }
161
162 // Validate the decoded value.
163 if (utf::IsValidCodepoint(code_point)) {
164 return utf::CodePointAndSize(code_point, byte_count);
165 }
166
167 return pw::Status::OutOfRange();
168}
169
171constexpr bool IsStringValid(std::string_view str) {
172 while (!str.empty()) {
173 auto rslt = utf8::ReadCodePoint(str);
174 if (!rslt.ok() || !utf::IsValidCharacter(rslt->code_point())) {
175 return false;
176 }
177 str = str.substr(rslt->size());
178 }
179 return true;
180}
181
/// Encapsulates the result of encoding a single code point as UTF-8.
class EncodedCodePoint {
 public:
  /// Stores the encoded bytes.
  ///
  /// @param size Number of leading bytes of `data` that are meaningful.
  /// @param data The encoded bytes.
  ///
  /// `data` is copied directly: `std::array<char, 4>` is trivially copyable,
  /// so moving it would be the same operation.
  constexpr EncodedCodePoint(uint32_t size, std::array<char, 4> data)
      : size_(size), data_(data) {}
  constexpr EncodedCodePoint(EncodedCodePoint&& encoded) = default;

  /// Returns a view of the first `size` encoded bytes. The view refers to
  /// storage inside this object and must not outlive it.
  constexpr std::string_view as_view() const { return {data_.data(), size_}; }

 private:
  // Number of meaningful bytes in data_.
  uint32_t size_;
  // Encoded bytes; only the first size_ are exposed by as_view().
  std::array<char, 4> data_;
};
194
221constexpr Result<EncodedCodePoint> EncodeCodePoint(uint32_t code_point) {
222 if (code_point <= 0x7F) {
223 return EncodedCodePoint{1, {static_cast<char>(code_point)}};
224 }
225 if (code_point <= 0x7FF) {
226 return EncodedCodePoint{2,
227 {static_cast<char>(0xC0 | (code_point >> 6)),
228 static_cast<char>(0x80 | (code_point & 0x3F))}};
229 }
230 if (code_point <= 0xFFFF) {
231 return EncodedCodePoint{
232 3,
233 {static_cast<char>(0xE0 | (code_point >> 12)),
234 static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
235 static_cast<char>(0x80 | (code_point & 0x3F))}};
236 }
237 if (code_point <= 0x10FFFF) {
238 return EncodedCodePoint{
239 4,
240 {static_cast<char>(0xF0 | (code_point >> 18)),
241 static_cast<char>(0x80 | ((code_point >> 12) & 0x3F)),
242 static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
243 static_cast<char>(0x80 | (code_point & 0x3F))}};
244 }
245
246 return pw::Status::OutOfRange();
247}
248
250Status WriteCodePoint(uint32_t code_point, pw::StringBuilder& output);
251
252} // namespace utf8
253
254} // namespace pw
Definition: string_builder.h:87
Encapsulates the result of encoding a single code point as UTF-8.
Definition: utf_codecs.h:183
Definition: utf_codecs.h:57
constexpr uint32_t code_point() const
Returns the code point this represents.
Definition: utf_codecs.h:69
constexpr size_t size() const
Returns the number of bytes required to encode this codepoint.
Definition: utf_codecs.h:72
constexpr CodePointAndSize(uint32_t code_point, size_t size)
Creates a combined view of a @code_point and its encoded @size.
Definition: utf_codecs.h:60
Provides basic helpers for reading and writing UTF-8 encoded strings.
Definition: alignment.h:27
pw::StringBuilder facilitates creating formatted strings in a fixed-sized buffer or in a pw::InlineSt...