C/C++ API Reference
Loading...
Searching...
No Matches
utf_codecs.h
1// Copyright 2024 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14
15#pragma once
16
17#include <array>
18#include <cstdint>
19#include <string_view>
20
21#include "pw_result/result.h"
22#include "pw_status/status.h"
24
25namespace pw {
26
28namespace utf {
29
31
/// Reports whether `code_point` can be encoded as UTF-8.
///
/// A value is accepted when it is at most U+10FFFF and does not fall inside
/// the UTF-16 surrogate range U+D800..U+DFFF.
///
/// @param code_point The value to check.
/// @returns `true` if the value is in range and not a surrogate.
constexpr inline bool IsValidCodepoint(uint32_t code_point) {
  constexpr uint32_t kFirstSurrogate = 0xD800u;
  constexpr uint32_t kFirstAfterSurrogates = 0xE000u;
  constexpr uint32_t kLargestCodePoint = 0x10FFFFu;

  // Anything below the surrogate block is always acceptable.
  if (code_point < kFirstSurrogate) {
    return true;
  }
  // Otherwise the value must sit above the surrogates and within range.
  return kFirstAfterSurrogates <= code_point && code_point <= kLargestCodePoint;
}
41
/// Reports whether `code_point` is an assignable character value.
///
/// On top of the range and surrogate checks, this also rejects the
/// U+FDD0..U+FDEF block and every value whose low 16 bits are 0xFFFE or
/// 0xFFFF.
///
/// @param code_point The value to check.
/// @returns `true` if none of the exclusions above apply.
constexpr inline bool IsValidCharacter(uint32_t code_point) {
  // Beyond the last code point.
  if (code_point > 0x10FFFFu) {
    return false;
  }
  // UTF-16 surrogate block.
  if (code_point >= 0xD800u && code_point < 0xE000u) {
    return false;
  }
  // Contiguous excluded block U+FDD0..U+FDEF.
  if (code_point >= 0xFDD0u && code_point <= 0xFDEFu) {
    return false;
  }
  // Clearing bit 0 folds xxFFFF onto xxFFFE, so one compare covers both.
  return (code_point & 0xFFFEu) != 0xFFFEu;
}
52
/// Packs a code point together with its encoded byte count in one 32-bit word.
///
/// The top `kSizeBits` (4) bits hold the size and the remaining 28 bits hold
/// the code point.
class CodePointAndSize final {
 public:
  /// Creates a combined view of a code point and its encoded size.
  ///
  /// @param code_point The code point to store. Only the low 28 bits are
  ///     kept, so an oversized value can no longer spill into the size field.
  /// @param size The encoded size in bytes. Only the low 4 bits survive the
  ///     shift into the packed word.
  explicit constexpr CodePointAndSize(uint32_t code_point, size_t size)
      : code_point_((static_cast<uint32_t>(size) << kSizeShift) |
                    (code_point & kCodePointMask)) {}

  constexpr CodePointAndSize(const CodePointAndSize&) = default;
  constexpr CodePointAndSize& operator=(const CodePointAndSize&) = default;
  constexpr CodePointAndSize(CodePointAndSize&&) = default;
  constexpr CodePointAndSize& operator=(CodePointAndSize&&) = default;

  /// Returns the code point this represents.
  constexpr uint32_t code_point() const { return code_point_ & kCodePointMask; }

  /// Returns the number of bytes required to encode this codepoint.
  constexpr size_t size() const {
    return (code_point_ & kSizeMask) >> kSizeShift;
  }

 private:
  // Number of high bits reserved for the size field.
  static constexpr size_t kSizeBits = 4;
  // Selects the low 28 bits that carry the code point.
  static constexpr uint32_t kCodePointMask = ~0U >> kSizeBits;
  // Selects the high 4 bits that carry the size.
  static constexpr uint32_t kSizeMask = ~kCodePointMask;
  // Bit position at which the size field starts.
  static constexpr size_t kSizeShift = sizeof(uint32_t) * 8 - kSizeBits;
  // Packed storage: [size:4][code point:28].
  uint32_t code_point_;
};
86
88
89} // namespace utf
90
91namespace utf8 {
92
94
114 std::string_view str) {
115 if (str.empty()) {
117 }
118
119 const uint8_t leading_byte = static_cast<uint8_t>(str.front());
120 size_t byte_count = 0;
121 uint32_t code_point = 0xFFFFFFFFu;
122
123 if (leading_byte <= 0x7F) {
124 byte_count = 1;
125 // b0xxx xxxx
126 code_point = leading_byte;
127 } else if (leading_byte <= 0xDF) {
128 byte_count = 2;
129 if (str.size() < byte_count) {
131 }
132 // b110x xxxx 10xx xxxx
133 if ((str[1] & 0xC0) != 0x80) {
134 // Invalid continuation
136 }
137 code_point = (static_cast<uint32_t>(str[0] & 0x1F) << 6) +
138 static_cast<uint32_t>(str[1] & 0x3F);
139 } else if (leading_byte <= 0xEF) {
140 byte_count = 3;
141 if (str.size() < byte_count) {
143 }
144 if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80) {
145 // Invalid continuation
147 }
148 // b1110 xxxx 10xx xxxx 10xx xxxx
149 code_point = (static_cast<uint32_t>(str[0] & 0x0F) << 12) +
150 (static_cast<uint32_t>(str[1] & 0x3F) << 6) +
151 static_cast<uint32_t>(str[2] & 0x3F);
152 } else if (leading_byte <= 0xF7) {
153 byte_count = 4;
154 if (str.size() < byte_count) {
156 }
157 if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80 ||
158 (str[3] & 0xC0) != 0x80) {
159 // Invalid continuation
161 }
162 // b1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
163 code_point = (static_cast<uint32_t>(str[0] & 0x07) << 18) +
164 (static_cast<uint32_t>(str[1] & 0x3F) << 12) +
165 (static_cast<uint32_t>(str[2] & 0x3F) << 6) +
166 static_cast<uint32_t>(str[3] & 0x3F);
167 } else {
169 }
170
171 // Validate the decoded value.
172 if (utf::IsValidCodepoint(code_point)) {
173 return utf::CodePointAndSize(code_point, byte_count);
174 }
175
176 return pw::Status::OutOfRange();
177}
178
180constexpr bool IsStringValid(std::string_view str) {
181 while (!str.empty()) {
182 auto rslt = utf8::ReadCodePoint(str);
183 if (!rslt.ok() || !utf::IsValidCharacter(rslt->code_point())) {
184 return false;
185 }
186 str = str.substr(rslt->size());
187 }
188 return true;
189}
190
193 public:
194 constexpr EncodedCodePoint(uint32_t size, std::array<char, 4> data)
195 : size_(size), data_(std::move(data)) {}
196 constexpr EncodedCodePoint(EncodedCodePoint&& encoded) = default;
197 constexpr std::string_view as_view() const { return {data_.data(), size_}; }
198
199 private:
200 uint32_t size_;
201 std::array<char, 4> data_;
202};
203
230constexpr Result<EncodedCodePoint> EncodeCodePoint(uint32_t code_point) {
231 if (code_point <= 0x7F) {
232 return EncodedCodePoint{1, {static_cast<char>(code_point)}};
233 }
234 if (code_point <= 0x7FF) {
235 return EncodedCodePoint{2,
236 {static_cast<char>(0xC0 | (code_point >> 6)),
237 static_cast<char>(0x80 | (code_point & 0x3F))}};
238 }
239 if (code_point <= 0xFFFF) {
240 return EncodedCodePoint{
241 3,
242 {static_cast<char>(0xE0 | (code_point >> 12)),
243 static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
244 static_cast<char>(0x80 | (code_point & 0x3F))}};
245 }
246 if (code_point <= 0x10FFFF) {
247 return EncodedCodePoint{
248 4,
249 {static_cast<char>(0xF0 | (code_point >> 18)),
250 static_cast<char>(0x80 | ((code_point >> 12) & 0x3F)),
251 static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
252 static_cast<char>(0x80 | (code_point & 0x3F))}};
253 }
254
255 return pw::Status::OutOfRange();
256}
257
259Status WriteCodePoint(uint32_t code_point, pw::StringBuilder& output);
260
262
263} // namespace utf8
264
265} // namespace pw
Definition: poll.h:25
Definition: status.h:109
static constexpr Status InvalidArgument()
Argument was malformed; e.g. invalid characters when parsing integer.
Definition: status.h:131
static constexpr Status OutOfRange()
Operation attempted out of range; e.g. seeking past end of file.
Definition: status.h:172
Definition: string_builder.h:89
Encapsulates the result of encoding a single code point as UTF-8.
Definition: utf_codecs.h:192
Definition: utf_codecs.h:60
constexpr uint32_t code_point() const
Returns the code point this represents.
Definition: utf_codecs.h:72
constexpr size_t size() const
Returns the number of bytes required to encode this codepoint.
Definition: utf_codecs.h:75
constexpr CodePointAndSize(uint32_t code_point, size_t size)
Creates a combined view of a code point (`code_point`) and its encoded size in bytes (`size`).
Definition: utf_codecs.h:63
constexpr Result< EncodedCodePoint > EncodeCodePoint(uint32_t code_point)
Encodes a single code point as UTF-8.
Definition: utf_codecs.h:230
constexpr bool IsStringValid(std::string_view str)
Determines if str is a valid UTF-8 string.
Definition: utf_codecs.h:180
constexpr bool IsValidCodepoint(uint32_t code_point)
Definition: utf_codecs.h:37
constexpr pw::Result< utf::CodePointAndSize > ReadCodePoint(std::string_view str)
Reads the first code point from a UTF-8 encoded str.
Definition: utf_codecs.h:113
constexpr bool IsValidCharacter(uint32_t code_point)
Definition: utf_codecs.h:46
Status WriteCodePoint(uint32_t code_point, pw::StringBuilder &output)
Helper that writes a code point to the provided pw::StringBuilder.
The Pigweed namespace.
Definition: alignment.h:27
pw::StringBuilder facilitates creating formatted strings in a fixed-sized buffer or in a pw::InlineString.