libpqxx
The C++ client library for PostgreSQL
encodings.hxx
Go to the documentation of this file.
1 
9 #ifndef PQXX_INTERNAL_ENCODINGS_HXX
10 #define PQXX_INTERNAL_ENCODINGS_HXX
11 
12 #include <cassert>
13 #include <iomanip>
14 #include <string>
15 #include <string_view>
16 
17 #include "pqxx/encoding_group.hxx"
18 #include "pqxx/strconv.hxx"
19 
20 
21 namespace pqxx::internal
22 {
24 PQXX_PURE char const *name_encoding(int encoding_id) noexcept;
25 
28 enc_group(int /* libpq encoding ID */, sl);
29 
30 
32 
33 PQXX_PURE PQXX_INLINE_ONLY PQXX_HOT constexpr inline unsigned char
34 get_byte(std::string_view buffer, std::size_t offset) noexcept
35 {
36  assert(offset < std::size(buffer));
37  return static_cast<unsigned char>(buffer[offset]);
38 }
39 
40 
42 
50  char const *encoding_name, std::string_view buffer, std::size_t start,
51  std::size_t count, sl loc);
52 
53 
55 
63 [[noreturn]] PQXX_COLD PQXX_LIBEXPORT PQXX_ZARGS void
65  char const *encoding_name, std::string_view buffer, std::size_t start,
66  sl loc);
67 
68 
70 
71 PQXX_PURE PQXX_INLINE_ONLY PQXX_HOT constexpr inline bool
72 between_inc(unsigned char value, unsigned bottom, unsigned top) noexcept
73 {
74  return value >= bottom and value <= top;
75 }
76 
77 
79 
83 template<encoding_group> struct glyph_scanner final
84 {
86 
88  static constexpr inline std::size_t
89  call(std::string_view, std::size_t start, sl);
90 };
91 
92 
94 
100 template<encoding_group ENC, char... NEEDLE>
101 PQXX_INLINE_COV PQXX_HOT inline constexpr std::size_t
102 find_ascii_char(std::string_view haystack, std::size_t here, sl loc)
103 {
104  // We only know how to search for ASCII characters. It's an optimisation
105  // assumption in the code below.
106  static_assert((... and ((NEEDLE & 0x80) == 0)));
107 
108  auto const sz{std::size(haystack)};
109  char const *PQXX_RESTRICT const data{std::data(haystack)};
110  while (here < sz)
111  {
112  // Look up the next character boundary. This can be quite costly, so we
113  // desperately want the call inlined.
114  auto next{glyph_scanner<ENC>::call(haystack, here, loc)};
115  PQXX_ASSUME(next > here);
116 
117  // (For some reason gcc had a problem with a right-fold here. But clang
118  // was fine.)
119  //
120  // In all supported encodings, if a character's first byte is in the ASCII
121  // range, that means it's a single-byte character. It follows that when we
122  // find a match at a position that's the beginning of a character, we do
123  // not need to check that we're in a single-byte character. We are.
124  //
125  // So, we only ever need to check each character's first byte, and if it
126  // doesn't match, move on to the next character.
127  //
128  // As an optimisation for "ASCII-safe" encodings however, we just check
129  // every byte in the text. It's going to be faster than finding character
130  // boundaries first. In these encodings, a multichar byte never contains
131  // any bytes in the ASCII range at all.
132  if ((... or (data[here] == NEEDLE)))
133  return here;
134 
135  // Nope, no hit. Move on.
136  here = next;
137  }
138  return sz;
139 }
140 
141 
143 
154 template<> struct glyph_scanner<encoding_group::ascii_safe> final
155 {
156  PQXX_INLINE_ONLY PQXX_PURE PQXX_HOT static constexpr std::size_t
157  call(std::string_view, std::size_t start, sl) noexcept
158  {
159  return start + 1;
160  }
161 };
162 
163 
165 
178 template<> struct glyph_scanner<encoding_group::two_tier> final
179 {
180  PQXX_INLINE_ONLY static constexpr std::size_t
181  call(std::string_view buffer, std::size_t start, sl loc)
182  {
183  auto const byte1{get_byte(buffer, start)};
184  if (byte1 < 0x80)
185  {
186  // Single-byte ASCII subset.
187  return start + 1;
188  }
189  else if (start + 2 <= std::size(buffer))
190  {
191  // Two-byte character. Not all combinations are valid, but that's not
192  // our concern. All that matters to libpqxx is that it not mistake an
193  // ASCII-like value in the second byte for a special character, or vice
194  // versa.
195  return start + 2;
196  }
197  else
198  {
199  // We do need to ensure that the string does not end in the middle of
200  // a character, or an attacker could "steal" a special ASCII character
201  // that comes directly after the end of the input, and escape the bounds
202  // of the text that way.
203  [[unlikely]] throw_for_truncated_character(
204  "variable-width two-byte encoding", buffer, start, loc);
205  }
206  }
207 };
208 
209 
211 
215 template<> struct glyph_scanner<encoding_group::gb18030> final
216 {
217  PQXX_INLINE_ONLY static constexpr std::size_t
218  call(std::string_view buffer, std::size_t start, sl loc)
219  {
220  auto const byte1{get_byte(buffer, start)};
221  if (byte1 < 0x80)
222  return start + 1;
223  auto const sz{std::size(buffer)};
224  if (byte1 == 0x80)
225  throw_for_encoding_error("GB18030", buffer, start, sz - start, loc);
226 
227  if (start + 2 > sz) [[unlikely]]
228  throw_for_truncated_character("GB18030", buffer, start, loc);
229 
230  auto const byte2{get_byte(buffer, start + 1)};
231  if (between_inc(byte2, 0x40, 0xfe))
232  {
233  if (byte2 == 0x7f) [[unlikely]]
234  throw_for_encoding_error("GB18030", buffer, start, 2, loc);
235 
236  return start + 2;
237  }
238 
239  if (start + 4 > sz) [[unlikely]]
240  throw_for_truncated_character("GB18030", buffer, start, loc);
241 
242  if (
243  between_inc(byte2, 0x30, 0x39) and
244  between_inc(get_byte(buffer, start + 2), 0x81, 0xfe) and
245  between_inc(get_byte(buffer, start + 3), 0x30, 0x39))
246  return start + 4;
247 
248  [[unlikely]] throw_for_encoding_error("GB18030", buffer, start, 4, loc);
249  }
250 };
251 
252 
254 
260 template<> struct glyph_scanner<encoding_group::sjis> final
261 {
262  PQXX_INLINE_ONLY static constexpr std::size_t
263  call(std::string_view buffer, std::size_t start, sl loc)
264  {
265  auto const byte1{get_byte(buffer, start)};
266  if (byte1 < 0x80)
267  // ASCII subset (though some characters changed).
268  return start + 1;
269  if (between_inc(byte1, 0xa1, 0xdf))
270  // Katakana, also single-byte characters.
271  return start + 1;
272 
273  // We're a bit strict at checking the first byte, because this is a
274  // relatively complex encoding. We don't want to get fooled by some
275  // extension we don't know about. An error and a user complaint is still
276  // better than a lurking bug.
277  if (
278  not between_inc(byte1, 0x81, 0x9f) and
279  not between_inc(byte1, 0xe0, 0xfc)) [[unlikely]]
280  throw_for_encoding_error("SJIS", buffer, start, 1, loc);
281 
282  if (start + 2 > std::size(buffer)) [[unlikely]]
283  throw_for_truncated_character("SJIS", buffer, start, loc);
284 
285  return start + 2;
286  }
287 };
288 
289 
291 
301 template<char... NEEDLE>
302 PQXX_PURE
305 {
306  // All characters in NEEDLE must be ASCII.
307  static_assert((... and (static_cast<unsigned char>(NEEDLE) < 0x80)));
308 
309  // We don't support searching for a NEEDLE that's a letter. This allows us
310  // to lump UHC in with the more efficient ASCII-safe group.
311  static_assert((... and not between_inc(NEEDLE, 'A', 'Z')));
312  static_assert((... and not between_inc(NEEDLE, 'a', 'z')));
313 
314  switch (enc)
315  {
317  throw pqxx::argument_error{
318  "Tried to read text without knowing its encoding.", loc};
319 
322  encoding_group::ascii_safe, NEEDLE...>;
325  encoding_group::two_tier, NEEDLE...>;
330 
331  default:
332  throw pqxx::internal_error{
333  std::format(
334  "Unexpected encoding group: {}.",
335  static_cast<std::underlying_type_t<encoding_group>>(enc)),
336  loc};
337  }
338 }
339 } // namespace pqxx::internal
340 #endif
Invalid argument passed to libpqxx, similar to std::invalid_argument.
Definition: except.hxx:599
Internal error in libpqxx library.
Definition: except.hxx:558
#define PQXX_ZARGS
Definition: header-pre.hxx:144
#define PQXX_COLD
Definition: header-pre.hxx:80
#define PQXX_RESTRICT
Definition: header-pre.hxx:153
#define PQXX_LIBEXPORT
Definition: header-pre.hxx:225
#define PQXX_PURE
Definition: header-pre.hxx:64
#define PQXX_ASSUME(condition)
Definition: header-pre.hxx:247
#define PQXX_HOT
Definition: header-pre.hxx:72
#define PQXX_INLINE_COV
Don't generate out-of-line version of inline function for coverage runs.
Definition: header-pre.hxx:106
#define PQXX_RETURNS_NONNULL
Definition: header-pre.hxx:127
#define PQXX_INLINE_ONLY
Definition: header-pre.hxx:91
Private namespace for libpqxx's internal use; do not access.
Definition: connection.cxx:333
void throw_for_truncated_character(char const *encoding, std::string_view buffer, std::size_t start, sl loc)
Throw an error reporting that the input is truncated in mid-character.
Definition: encodings.cxx:273
PQXX_PURE PQXX_RETURNS_NONNULL constexpr PQXX_INLINE_COV char_finder_func * get_char_finder(encoding_group enc, sl loc)
Look up a character search function for an encoding group.
Definition: encodings.hxx:304
PQXX_PURE PQXX_INLINE_ONLY constexpr PQXX_HOT unsigned char get_byte(std::string_view buffer, std::size_t offset) noexcept
Extract byte from buffer, return as unsigned char.
Definition: encodings.hxx:34
void throw_for_encoding_error(char const *encoding, std::string_view buffer, std::size_t start, std::size_t count, sl loc)
Throw an error reporting that input text is not properly encoded.
Definition: encodings.cxx:261
PQXX_INLINE_COV constexpr PQXX_HOT std::size_t find_ascii_char(std::string_view haystack, std::size_t here, sl loc)
Find any of the ASCII characters in NEEDLE in haystack.
Definition: encodings.hxx:102
PQXX_PURE PQXX_INLINE_ONLY constexpr PQXX_HOT bool between_inc(unsigned char value, unsigned bottom, unsigned top) noexcept
Does value lie between bottom and top, inclusive?
Definition: encodings.hxx:72
char const * name_encoding(int encoding_id) noexcept
Return PostgreSQL's name for encoding enum value.
Definition: encodings.cxx:234
constexpr encoding_group enc_group(std::string_view encoding_name, sl loc)
Look up encoding group for an encoding by name.
Definition: encodings.cxx:56
std::size_t(std::string_view haystack, std::size_t start, sl) char_finder_func
Function type: "find first occurrence of any of these ASCII characters.".
Definition: encoding_group.hxx:110
std::source_location sl
Convenience alias for std::source_location. It's just too long.
Definition: types.hxx:38
encoding_group
Definition: encoding_group.hxx:40
@ two_tier
Low byte is ASCII, high byte starts a 2-byte character.
@ sjis
Non-ASCII-safe: Japanese JIS and Shift JIS.
@ unknown
Default: indeterminate encoding. All we know is it supports ASCII.
@ ascii_safe
"ASCII-safe" encodings.
@ gb18030
Non-ASCII-safe: GB18030 for Chinese (Traditional & Simplified).
format
Format code: is data text or binary?
Definition: types.hxx:121
PQXX_INLINE_ONLY PQXX_PURE static constexpr PQXX_HOT std::size_t call(std::string_view, std::size_t start, sl) noexcept
Definition: encodings.hxx:157
static constexpr PQXX_INLINE_ONLY std::size_t call(std::string_view buffer, std::size_t start, sl loc)
Definition: encodings.hxx:218
static constexpr PQXX_INLINE_ONLY std::size_t call(std::string_view buffer, std::size_t start, sl loc)
Definition: encodings.hxx:263
static constexpr PQXX_INLINE_ONLY std::size_t call(std::string_view buffer, std::size_t start, sl loc)
Definition: encodings.hxx:181
Wrapper struct template for "find next glyph" functions.
Definition: encodings.hxx:84
static constexpr std::size_t call(std::string_view, std::size_t start, sl)
Find the next glyph in buffer after position start.