C/C++ API Reference
Loading...
Searching...
No Matches
utf_codecs.h
1// Copyright 2024 The Pigweed Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4// use this file except in compliance with the License. You may obtain a copy of
5// the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12// License for the specific language governing permissions and limitations under
13// the License.
14
15#pragma once
16
17#include <array>
18#include <cstdint>
19#include <string_view>
20
21#include "pw_result/result.h"
22#include "pw_status/status.h"
24
25namespace pw {
26
28namespace utf {
29
31
/// Checks whether `code_point` can be represented as a Unicode scalar value.
///
/// @param code_point The candidate value.
///
/// @returns `true` for values in [0, 0x10FFFF] that lie outside the surrogate
/// range [0xD800, 0xDFFF]; `false` otherwise.
constexpr inline bool IsValidCodepoint(uint32_t code_point) {
  constexpr uint32_t kFirstSurrogate = 0xD800u;
  constexpr uint32_t kLastSurrogate = 0xDFFFu;
  constexpr uint32_t kLargestCodePoint = 0x10FFFFu;

  // Anything past the end of the code space is rejected outright.
  if (code_point > kLargestCodePoint) {
    return false;
  }
  // Within the code space, only the surrogate block is excluded.
  return code_point < kFirstSurrogate || code_point > kLastSurrogate;
}
41
/// Checks whether `code_point` is an assignable character value.
///
/// On top of excluding surrogates [0xD800, 0xDFFF] and values above 0x10FFFF,
/// this rejects the block [0xFDD0, 0xFDEF] and, at or above 0xFDF0, every
/// value whose low 16 bits are 0xFFFE or 0xFFFF.
///
/// @param code_point The candidate value.
///
/// @returns `true` if `code_point` passes all of the checks above.
constexpr inline bool IsValidCharacter(uint32_t code_point) {
  // Walk the ranges in ascending order; each test relies on the ones before.
  if (code_point < 0xD800u) {
    return true;
  }
  if (code_point < 0xE000u) {
    return false;  // Surrogate block.
  }
  if (code_point < 0xFDD0u) {
    return true;
  }
  if (code_point <= 0xFDEFu) {
    return false;  // Contiguous excluded block 0xFDD0..0xFDEF.
  }
  if (code_point > 0x10FFFFu) {
    return false;  // Beyond the end of the code space.
  }
  // Masking off the lowest bit folds 0x..FFFF onto 0x..FFFE, so one compare
  // rejects both trailing values of every 64 Ki block.
  return (code_point & 0xFFFEu) != 0xFFFEu;
}
52
/// Packs a code point and the byte length of its encoding into one 32-bit
/// word: the top `kSizeBits` bits hold the size and the remaining low bits
/// hold the code point.
class CodePointAndSize final {
 public:
  /// Creates a combined view of a `code_point` and its encoded `size`.
  ///
  /// @param code_point Value to store. Only the low 28 bits are kept; higher
  ///   bits are discarded so they cannot leak into the size field.
  /// @param size Encoded length in bytes. Only the low 4 bits are kept.
  explicit constexpr CodePointAndSize(uint32_t code_point, size_t size)
      : code_point_((static_cast<uint32_t>(size) << kSizeShift) |
                    (code_point & kCodePointMask)) {}

  constexpr CodePointAndSize(const CodePointAndSize&) = default;
  constexpr CodePointAndSize& operator=(const CodePointAndSize&) = default;
  constexpr CodePointAndSize(CodePointAndSize&&) = default;
  constexpr CodePointAndSize& operator=(CodePointAndSize&&) = default;

  /// Returns the code point this represents.
  constexpr uint32_t code_point() const { return code_point_ & kCodePointMask; }

  /// Returns the number of bytes required to encode this codepoint.
  constexpr size_t size() const {
    return (code_point_ & kSizeMask) >> kSizeShift;
  }

 private:
  // Number of high bits reserved for the size field.
  static constexpr size_t kSizeBits = 4;
  // Selects the low (32 - kSizeBits) bits that hold the code point.
  static constexpr uint32_t kCodePointMask = ~0U >> kSizeBits;
  // Selects the high kSizeBits bits that hold the size.
  static constexpr uint32_t kSizeMask = ~kCodePointMask;
  // Left shift that moves a size value into the high bits.
  static constexpr size_t kSizeShift = sizeof(uint32_t) * 8 - kSizeBits;
  // Packed storage: [size : kSizeBits][code point : 32 - kSizeBits].
  uint32_t code_point_;
};
86
88
89} // namespace utf
90
91namespace utf8 {
92
94
107 std::string_view str) {
108 if (str.empty()) {
110 }
111
112 const uint8_t leading_byte = static_cast<uint8_t>(str.front());
113 size_t byte_count = 0;
114 uint32_t code_point = 0xFFFFFFFFu;
115
116 if (leading_byte <= 0x7F) {
117 byte_count = 1;
118 // b0xxx xxxx
119 code_point = leading_byte;
120 } else if (leading_byte <= 0xDF) {
121 byte_count = 2;
122 if (str.size() < byte_count) {
124 }
125 // b110x xxxx 10xx xxxx
126 if ((str[1] & 0xC0) != 0x80) {
127 // Invalid continuation
129 }
130 code_point = (static_cast<uint32_t>(str[0] & 0x1F) << 6) +
131 static_cast<uint32_t>(str[1] & 0x3F);
132 } else if (leading_byte <= 0xEF) {
133 byte_count = 3;
134 if (str.size() < byte_count) {
136 }
137 if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80) {
138 // Invalid continuation
140 }
141 // b1110 xxxx 10xx xxxx 10xx xxxx
142 code_point = (static_cast<uint32_t>(str[0] & 0x0F) << 12) +
143 (static_cast<uint32_t>(str[1] & 0x3F) << 6) +
144 static_cast<uint32_t>(str[2] & 0x3F);
145 } else if (leading_byte <= 0xF7) {
146 byte_count = 4;
147 if (str.size() < byte_count) {
149 }
150 if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80 ||
151 (str[3] & 0xC0) != 0x80) {
152 // Invalid continuation
154 }
155 // b1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
156 code_point = (static_cast<uint32_t>(str[0] & 0x07) << 18) +
157 (static_cast<uint32_t>(str[1] & 0x3F) << 12) +
158 (static_cast<uint32_t>(str[2] & 0x3F) << 6) +
159 static_cast<uint32_t>(str[3] & 0x3F);
160 } else {
162 }
163
164 // Validate the decoded value.
165 if (utf::IsValidCodepoint(code_point)) {
166 return utf::CodePointAndSize(code_point, byte_count);
167 }
168
169 return pw::Status::OutOfRange();
170}
171
173constexpr bool IsStringValid(std::string_view str) {
174 while (!str.empty()) {
175 auto rslt = utf8::ReadCodePoint(str);
176 if (!rslt.ok() || !utf::IsValidCharacter(rslt->code_point())) {
177 return false;
178 }
179 str = str.substr(rslt->size());
180 }
181 return true;
182}
183
/// Encapsulates the result of encoding a single code point as UTF-8.
///
/// The encoded bytes are stored inline in a four-byte array together with the
/// count of bytes that are in use.
class EncodedCodePoint {
 public:
  /// @param size Number of leading bytes of `data` that are meaningful.
  /// @param data Storage holding the encoded bytes, starting at index 0.
  constexpr EncodedCodePoint(uint32_t size, std::array<char, 4> data)
      // A four-char array is trivially copyable, so a plain copy is all that
      // is needed here; this also avoids depending on <utility>.
      : size_(size), data_(data) {}
  constexpr EncodedCodePoint(EncodedCodePoint&& encoded) = default;

  /// Returns a view over the first `size` stored bytes. The view refers to
  /// storage inside this object.
  constexpr std::string_view as_view() const { return {data_.data(), size_}; }

 private:
  // Count of valid bytes at the front of data_.
  uint32_t size_;
  // Encoded bytes; entries at index >= size_ are not exposed by as_view().
  std::array<char, 4> data_;
};
196
216constexpr Result<EncodedCodePoint> EncodeCodePoint(uint32_t code_point) {
217 if (code_point <= 0x7F) {
218 return EncodedCodePoint{1, {static_cast<char>(code_point)}};
219 }
220 if (code_point <= 0x7FF) {
221 return EncodedCodePoint{2,
222 {static_cast<char>(0xC0 | (code_point >> 6)),
223 static_cast<char>(0x80 | (code_point & 0x3F))}};
224 }
225 if (code_point <= 0xFFFF) {
226 return EncodedCodePoint{
227 3,
228 {static_cast<char>(0xE0 | (code_point >> 12)),
229 static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
230 static_cast<char>(0x80 | (code_point & 0x3F))}};
231 }
232 if (code_point <= 0x10FFFF) {
233 return EncodedCodePoint{
234 4,
235 {static_cast<char>(0xF0 | (code_point >> 18)),
236 static_cast<char>(0x80 | ((code_point >> 12) & 0x3F)),
237 static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
238 static_cast<char>(0x80 | (code_point & 0x3F))}};
239 }
240
241 return pw::Status::OutOfRange();
242}
243
245Status WriteCodePoint(uint32_t code_point, pw::StringBuilder& output);
246
248
249} // namespace utf8
250
251} // namespace pw
Definition: result.h:143
Definition: status.h:120
static constexpr Status InvalidArgument()
Definition: status.h:164
static constexpr Status OutOfRange()
Definition: status.h:267
Definition: string_builder.h:89
Encapsulates the result of encoding a single code point as UTF-8.
Definition: utf_codecs.h:185
Definition: utf_codecs.h:60
constexpr uint32_t code_point() const
Returns the code point this represents.
Definition: utf_codecs.h:72
constexpr size_t size() const
Returns the number of bytes required to encode this codepoint.
Definition: utf_codecs.h:75
constexpr CodePointAndSize(uint32_t code_point, size_t size)
Creates a combined view of a @code_point and its encoded @size.
Definition: utf_codecs.h:63
constexpr Result< EncodedCodePoint > EncodeCodePoint(uint32_t code_point)
Encodes a single code point as UTF-8.
Definition: utf_codecs.h:216
constexpr bool IsStringValid(std::string_view str)
Determines if str is a valid UTF-8 string.
Definition: utf_codecs.h:173
constexpr bool IsValidCodepoint(uint32_t code_point)
Definition: utf_codecs.h:37
constexpr pw::Result< utf::CodePointAndSize > ReadCodePoint(std::string_view str)
Reads the first code point from a UTF-8 encoded str.
Definition: utf_codecs.h:106
constexpr bool IsValidCharacter(uint32_t code_point)
Definition: utf_codecs.h:46
Status WriteCodePoint(uint32_t code_point, pw::StringBuilder &output)
Helper that writes a code point to the provided pw::StringBuilder.
The Pigweed namespace.
Definition: alignment.h:27
pw::StringBuilder facilitates creating formatted strings in a fixed-sized buffer or in a pw::InlineSt...