libpqxx
The C++ client library for PostgreSQL
encodings.hxx
Go to the documentation of this file.
1 
9 #ifndef PQXX_INTERNAL_ENCODINGS_HXX
10 #define PQXX_INTERNAL_ENCODINGS_HXX
11 
12 #include <cassert>
13 #include <iomanip>
14 #include <string>
15 #include <string_view>
16 
17 #include "pqxx/encoding_group.hxx"
18 #include "pqxx/strconv.hxx"
19 
20 
21 namespace pqxx
22 {
24 } // namespace pqxx
25 
26 
27 namespace pqxx::internal
28 {
30 PQXX_PURE char const *name_encoding(int encoding_id) noexcept;
31 
34 enc_group(int /* libpq encoding ID */, sl);
35 
36 
38 
39 PQXX_PURE PQXX_INLINE_ONLY constexpr inline unsigned char
40 get_byte(std::string_view buffer, std::size_t offset) noexcept
41 {
42  assert(offset < std::size(buffer));
43  return static_cast<unsigned char>(buffer[offset]);
44 }
45 
46 
48 
56  char const *encoding_name, std::string_view buffer, std::size_t start,
57  std::size_t count, sl loc);
58 
59 
61 
69 [[noreturn]] PQXX_COLD PQXX_LIBEXPORT PQXX_ZARGS void
71  char const *encoding_name, std::string_view buffer, std::size_t start,
72  sl loc);
73 
74 
76 
77 PQXX_PURE PQXX_INLINE_ONLY constexpr inline bool
78 between_inc(unsigned char value, unsigned bottom, unsigned top) noexcept
79 {
80  return value >= bottom and value <= top;
81 }
82 
83 
85 
89 template<encoding_group> struct glyph_scanner final
90 {
92 
94  static constexpr inline std::size_t
95  call(std::string_view, std::size_t start, sl);
96 };
97 
98 
100 
106 template<encoding_group ENC, char... NEEDLE>
107 PQXX_INLINE_COV inline constexpr std::size_t
108 find_ascii_char(std::string_view haystack, std::size_t here, sl loc)
109 {
110  // We only know how to search for ASCII characters. It's an optimisation
111  // assumption in the code below.
112  static_assert((... and ((NEEDLE & 0x80) == 0)));
113 
114  auto const sz{std::size(haystack)};
115  auto const data{std::data(haystack)};
116  while (here < sz)
117  {
118  // Look up the next character boundary. This can be quite costly, so we
119  // desperately want the call inlined.
120  auto next{glyph_scanner<ENC>::call(haystack, here, loc)};
121  PQXX_ASSUME(next > here);
122 
123  // (For some reason gcc had a problem with a right-fold here. But clang
124  // was fine.)
125  //
126  // In all supported encodings, if a character's first byte is in the ASCII
127  // range, that means it's a single-byte character. It follows that when we
128  // find a match at a position that's the beginning of a character, we do
129  // not need to check that we're in a single-byte character. We are.
130  //
131  // So, we only ever need to check each character's first byte, and if it
132  // doesn't match, move on to the next character.
133  //
134  // As an optimisation for "ASCII-safe" encodings however, we just check
135  // every byte in the text. It's going to be faster than finding character
136  // boundaries first. In these encodings, a multichar byte never contains
137  // any bytes in the ASCII range at all.
138  if ((... or (data[here] == NEEDLE)))
139  return here;
140 
141  // Nope, no hit. Move on.
142  here = next;
143  }
144  return sz;
145 }
146 
147 
149 
160 template<> struct glyph_scanner<encoding_group::ascii_safe> final
161 {
162  PQXX_INLINE_ONLY PQXX_PURE static constexpr std::size_t
163  call(std::string_view, std::size_t start, sl) noexcept
164  {
165  return start + 1;
166  }
167 };
168 
169 
171 
184 template<> struct glyph_scanner<encoding_group::two_tier> final
185 {
186  PQXX_INLINE_ONLY static constexpr std::size_t
187  call(std::string_view buffer, std::size_t start, sl loc)
188  {
189  auto const byte1{get_byte(buffer, start)};
190  if (byte1 < 0x80)
191  {
192  // Single-byte ASCII subset.
193  return start + 1;
194  }
195  else if (start + 2 <= std::size(buffer))
196  {
197  // Two-byte character. Not all combinations are valid, but that's not
198  // our concern. All that matters to libpqxx is that it not mistake an
199  // ASCII-like value in the second byte for a special character, or vice
200  // versa.
201  return start + 2;
202  }
203  else
204  {
205  // We do need to ensure that the string does not end in the middle of
206  // a character, or an attacker could "steal" a special ASCII character
207  // that comes directly after the end of the input, and escape the bounds
208  // of the text that way.
209  [[unlikely]] throw_for_truncated_character(
210  "variable-width two-byte encoding", buffer, start, loc);
211  }
212  }
213 };
214 
215 
217 
221 template<> struct glyph_scanner<encoding_group::gb18030> final
222 {
223  PQXX_INLINE_ONLY static constexpr std::size_t
224  call(std::string_view buffer, std::size_t start, sl loc)
225  {
226  auto const byte1{get_byte(buffer, start)};
227  if (byte1 < 0x80)
228  return start + 1;
229  auto const sz{std::size(buffer)};
230  if (byte1 == 0x80)
231  throw_for_encoding_error("GB18030", buffer, start, sz - start, loc);
232 
233  if (start + 2 > sz) [[unlikely]]
234  throw_for_truncated_character("GB18030", buffer, start, loc);
235 
236  auto const byte2{get_byte(buffer, start + 1)};
237  if (between_inc(byte2, 0x40, 0xfe))
238  {
239  if (byte2 == 0x7f) [[unlikely]]
240  throw_for_encoding_error("GB18030", buffer, start, 2, loc);
241 
242  return start + 2;
243  }
244 
245  if (start + 4 > sz) [[unlikely]]
246  throw_for_truncated_character("GB18030", buffer, start, loc);
247 
248  if (
249  between_inc(byte2, 0x30, 0x39) and
250  between_inc(get_byte(buffer, start + 2), 0x81, 0xfe) and
251  between_inc(get_byte(buffer, start + 3), 0x30, 0x39))
252  return start + 4;
253 
254  [[unlikely]] throw_for_encoding_error("GB18030", buffer, start, 4, loc);
255  }
256 };
257 
258 
260 
266 template<> struct glyph_scanner<encoding_group::sjis> final
267 {
268  PQXX_INLINE_ONLY static constexpr std::size_t
269  call(std::string_view buffer, std::size_t start, sl loc)
270  {
271  auto const byte1{get_byte(buffer, start)};
272  if (byte1 < 0x80)
273  // ASCII subset (though some characters changed).
274  return start + 1;
275  if (between_inc(byte1, 0xa1, 0xdf))
276  // Katakana, also single-byte characters.
277  return start + 1;
278 
279  // We're a bit strict at checking the first byte, because this is a
280  // relatively complex encoding. We don't want to get fooled by some
281  // extension we don't know about. An error and a user complaint is still
282  // better than a lurking bug.
283  if (
284  not between_inc(byte1, 0x81, 0x9f) and
285  not between_inc(byte1, 0xe0, 0xfc)) [[unlikely]]
286  throw_for_encoding_error("SJIS", buffer, start, 1, loc);
287 
288  if (start + 2 > std::size(buffer)) [[unlikely]]
289  throw_for_truncated_character("SJIS", buffer, start, loc);
290 
291  return start + 2;
292  }
293 };
294 
295 
297 
307 template<char... NEEDLE>
308 PQXX_PURE
311 {
312  // All characters in NEEDLE must be ASCII.
313  static_assert((... and (static_cast<unsigned char>(NEEDLE) < 0x80)));
314 
315  // We don't support searching for a NEEDLE that's a letter. This allows us
316  // to lump UHC in with the more efficient ASCII-safe group.
317  static_assert((... and not between_inc(NEEDLE, 'A', 'Z')));
318  static_assert((... and not between_inc(NEEDLE, 'a', 'z')));
319 
320  switch (enc)
321  {
323  throw pqxx::argument_error{
324  "Tried to read text without knowing its encoding.", loc};
325 
328  encoding_group::ascii_safe, NEEDLE...>;
331  encoding_group::two_tier, NEEDLE...>;
336 
337  default:
338  throw pqxx::internal_error{
339  std::format("Unexpected encoding group: {}.", to_string(enc)), loc};
340  }
341 }
342 } // namespace pqxx::internal
343 #endif
Invalid argument passed to libpqxx, similar to std::invalid_argument.
Definition: except.hxx:599
Internal error in libpqxx library.
Definition: except.hxx:558
#define PQXX_ZARGS
Definition: header-pre.hxx:136
#define PQXX_COLD
Definition: header-pre.hxx:72
#define PQXX_LIBEXPORT
Definition: header-pre.hxx:206
#define PQXX_PURE
Definition: header-pre.hxx:64
#define PQXX_ASSUME(condition)
Definition: header-pre.hxx:228
#define PQXX_INLINE_COV
Don't generate out-of-line version of inline function for coverage runs.
Definition: header-pre.hxx:98
#define PQXX_RETURNS_NONNULL
Definition: header-pre.hxx:119
#define PQXX_INLINE_ONLY
Definition: header-pre.hxx:83
Private namespace for libpqxx's internal use; do not access.
Definition: connection.cxx:333
void throw_for_truncated_character(char const *encoding, std::string_view buffer, std::size_t start, sl loc)
Throw an error reporting that the input is truncated in mid-character.
Definition: encodings.cxx:273
PQXX_PURE PQXX_RETURNS_NONNULL constexpr PQXX_INLINE_COV char_finder_func * get_char_finder(encoding_group enc, sl loc)
Look up a character search function for an encoding group.
Definition: encodings.hxx:310
void throw_for_encoding_error(char const *encoding, std::string_view buffer, std::size_t start, std::size_t count, sl loc)
Throw an error reporting that input text is not properly encoded.
Definition: encodings.cxx:261
PQXX_PURE constexpr PQXX_INLINE_ONLY bool between_inc(unsigned char value, unsigned bottom, unsigned top) noexcept
Does value lie between bottom and top, inclusive?
Definition: encodings.hxx:78
char const * name_encoding(int encoding_id) noexcept
Return PostgreSQL's name for encoding enum value.
Definition: encodings.cxx:234
constexpr encoding_group enc_group(std::string_view encoding_name, sl loc)
Look up encoding group for an encoding by name.
Definition: encodings.cxx:56
constexpr PQXX_INLINE_COV std::size_t find_ascii_char(std::string_view haystack, std::size_t here, sl loc)
Find any of the ASCII characters in NEEDLE in haystack.
Definition: encodings.hxx:108
PQXX_PURE constexpr PQXX_INLINE_ONLY unsigned char get_byte(std::string_view buffer, std::size_t offset) noexcept
Extract byte from buffer, return as unsigned char.
Definition: encodings.hxx:40
std::size_t(std::string_view haystack, std::size_t start, sl) char_finder_func
Function type: "find first occurrence of any of these ASCII characters."
Definition: encoding_group.hxx:110
The home of all libpqxx classes, functions, templates, etc.
Definition: array.cxx:26
std::source_location sl
Convenience alias for std::source_location. It's just too long.
Definition: types.hxx:38
PQXX_LIBEXPORT std::string to_string(field_ref const &value, ctx)
Convert a field_ref to a string.
Definition: field.hxx:891
encoding_group
Definition: encoding_group.hxx:40
@ two_tier
A byte below 0x80 is a single-byte ASCII character; any other byte starts a 2-byte character.
@ sjis
Non-ASCII-safe: Japanese JIS and Shift JIS.
@ unknown
Default: indeterminate encoding. All we know is it supports ASCII.
@ ascii_safe
"ASCII-safe" encodings.
@ gb18030
Non-ASCII-safe: GB18030 for Chinese (Traditional & Simplified).
format
Format code: is data text or binary?
Definition: types.hxx:121
#define PQXX_DECLARE_ENUM_CONVERSION(ENUM)
Macro: Define a string conversion for an enum type.
Definition: strconv.hxx:617
PQXX_INLINE_ONLY static constexpr PQXX_PURE std::size_t call(std::string_view, std::size_t start, sl) noexcept
Definition: encodings.hxx:163
static constexpr PQXX_INLINE_ONLY std::size_t call(std::string_view buffer, std::size_t start, sl loc)
Definition: encodings.hxx:224
static constexpr PQXX_INLINE_ONLY std::size_t call(std::string_view buffer, std::size_t start, sl loc)
Definition: encodings.hxx:269
static constexpr PQXX_INLINE_ONLY std::size_t call(std::string_view buffer, std::size_t start, sl loc)
Definition: encodings.hxx:187
Wrapper struct template for "find next glyph" functions.
Definition: encodings.hxx:90
static constexpr std::size_t call(std::string_view, std::size_t start, sl)
Find the next glyph in buffer after position start.