TerraForge3D  2.3.1
3D Terrain And Landscape Generator

◆ scan_string()

template<typename BasicJsonType , typename InputAdapterType >
token_type nlohmann::detail::lexer< BasicJsonType, InputAdapterType >::scan_string ( )
inline private

scan a string literal

This function scans a string according to Sect. 7 of RFC 8259. While scanning, escape sequences are decoded and the resulting bytes are copied into buffer token_buffer. When the function returns successfully, token_buffer is not null-terminated (as it may contain \0 bytes), and token_buffer.size() is the number of bytes in the string.

Returns
token_type::value_string if string could be successfully scanned, token_type::parse_error otherwise
Note
In case of errors, variable error_message contains a textual description.

Definition at line 6839 of file json.hpp.

6840 {
6841 // reset token_buffer (ignore opening quote)
6842 reset();
6843 // we entered the function by reading an open quote
6844 JSON_ASSERT(current == '\"');
6845
6846 while (true)
6847 {
6848 // get next character
6849 switch (get())
6850 {
6851 // end of file while parsing string
6852 case std::char_traits<char_type>::eof():
6853 {
6854 error_message = "invalid string: missing closing quote";
6855 return token_type::parse_error;
6856 }
6857
6858 // closing quote
6859 case '\"':
6860 {
6861 return token_type::value_string;
6862 }
6863
6864 // escapes
6865 case '\\':
6866 {
6867 switch (get())
6868 {
6869 // quotation mark
6870 case '\"':
6871 add('\"');
6872 break;
6873
6874 // reverse solidus
6875 case '\\':
6876 add('\\');
6877 break;
6878
6879 // solidus
6880 case '/':
6881 add('/');
6882 break;
6883
6884 // backspace
6885 case 'b':
6886 add('\b');
6887 break;
6888
6889 // form feed
6890 case 'f':
6891 add('\f');
6892 break;
6893
6894 // line feed
6895 case 'n':
6896 add('\n');
6897 break;
6898
6899 // carriage return
6900 case 'r':
6901 add('\r');
6902 break;
6903
6904 // tab
6905 case 't':
6906 add('\t');
6907 break;
6908
6909 // unicode escapes
6910 case 'u':
6911 {
6912 const int codepoint1 = get_codepoint();
6913 int codepoint = codepoint1; // start with codepoint1
6914
6915 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
6916 {
6917 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
6918 return token_type::parse_error;
6919 }
6920
6921 // check if code point is a high surrogate
6922 if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
6923 {
6924 // expect next \uxxxx entry
6925 if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
6926 {
6927 const int codepoint2 = get_codepoint();
6928
6929 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
6930 {
6931 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
6932 return token_type::parse_error;
6933 }
6934
6935 // check if codepoint2 is a low surrogate
6936 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
6937 {
6938 // overwrite codepoint
6939 codepoint = static_cast<int>(
6940 // high surrogate supplies the upper 10 bits of the offset, hence the shift by 10
6941 (static_cast<unsigned int>(codepoint1) << 10u)
6942 // low surrogate supplies the lower 10 bits of the offset
6943 + static_cast<unsigned int>(codepoint2)
6944 // there is still the 0xD800, 0xDC00 and 0x10000 noise
6945 // in the result so we have to subtract with:
6946 // (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00
6947 - 0x35FDC00u);
6948 }
6949
6950 else
6951 {
6952 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
6953 return token_type::parse_error;
6954 }
6955 }
6956
6957 else
6958 {
6959 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
6960 return token_type::parse_error;
6961 }
6962 }
6963
6964 else
6965 {
6966 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
6967 {
6968 error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
6969 return token_type::parse_error;
6970 }
6971 }
6972
6973 // result of the above calculation yields a proper codepoint
6974 JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
6975
6976 // translate codepoint into bytes
6977 if (codepoint < 0x80)
6978 {
6979 // 1-byte characters: 0xxxxxxx (ASCII)
6980 add(static_cast<char_int_type>(codepoint));
6981 }
6982
6983 else if (codepoint <= 0x7FF)
6984 {
6985 // 2-byte characters: 110xxxxx 10xxxxxx
6986 add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
6987 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
6988 }
6989
6990 else if (codepoint <= 0xFFFF)
6991 {
6992 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
6993 add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
6994 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
6995 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
6996 }
6997
6998 else
6999 {
7000 // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
7001 add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
7002 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
7003 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
7004 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
7005 }
7006
7007 break;
7008 }
7009
7010 // other characters after escape
7011 default:
7012 error_message = "invalid string: forbidden character after backslash";
7013 return token_type::parse_error;
7014 }
7015
7016 break;
7017 }
7018
7019 // invalid control characters
7020 case 0x00:
7021 {
7022 error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
7023 return token_type::parse_error;
7024 }
7025
7026 case 0x01:
7027 {
7028 error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
7029 return token_type::parse_error;
7030 }
7031
7032 case 0x02:
7033 {
7034 error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
7035 return token_type::parse_error;
7036 }
7037
7038 case 0x03:
7039 {
7040 error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
7041 return token_type::parse_error;
7042 }
7043
7044 case 0x04:
7045 {
7046 error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
7047 return token_type::parse_error;
7048 }
7049
7050 case 0x05:
7051 {
7052 error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
7053 return token_type::parse_error;
7054 }
7055
7056 case 0x06:
7057 {
7058 error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
7059 return token_type::parse_error;
7060 }
7061
7062 case 0x07:
7063 {
7064 error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
7065 return token_type::parse_error;
7066 }
7067
7068 case 0x08:
7069 {
7070 error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
7071 return token_type::parse_error;
7072 }
7073
7074 case 0x09:
7075 {
7076 error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
7077 return token_type::parse_error;
7078 }
7079
7080 case 0x0A:
7081 {
7082 error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
7083 return token_type::parse_error;
7084 }
7085
7086 case 0x0B:
7087 {
7088 error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
7089 return token_type::parse_error;
7090 }
7091
7092 case 0x0C:
7093 {
7094 error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
7095 return token_type::parse_error;
7096 }
7097
7098 case 0x0D:
7099 {
7100 error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
7101 return token_type::parse_error;
7102 }
7103
7104 case 0x0E:
7105 {
7106 error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
7107 return token_type::parse_error;
7108 }
7109
7110 case 0x0F:
7111 {
7112 error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
7113 return token_type::parse_error;
7114 }
7115
7116 case 0x10:
7117 {
7118 error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
7119 return token_type::parse_error;
7120 }
7121
7122 case 0x11:
7123 {
7124 error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
7125 return token_type::parse_error;
7126 }
7127
7128 case 0x12:
7129 {
7130 error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
7131 return token_type::parse_error;
7132 }
7133
7134 case 0x13:
7135 {
7136 error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
7137 return token_type::parse_error;
7138 }
7139
7140 case 0x14:
7141 {
7142 error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
7143 return token_type::parse_error;
7144 }
7145
7146 case 0x15:
7147 {
7148 error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
7149 return token_type::parse_error;
7150 }
7151
7152 case 0x16:
7153 {
7154 error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
7155 return token_type::parse_error;
7156 }
7157
7158 case 0x17:
7159 {
7160 error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
7161 return token_type::parse_error;
7162 }
7163
7164 case 0x18:
7165 {
7166 error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
7167 return token_type::parse_error;
7168 }
7169
7170 case 0x19:
7171 {
7172 error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
7173 return token_type::parse_error;
7174 }
7175
7176 case 0x1A:
7177 {
7178 error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
7179 return token_type::parse_error;
7180 }
7181
7182 case 0x1B:
7183 {
7184 error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
7185 return token_type::parse_error;
7186 }
7187
7188 case 0x1C:
7189 {
7190 error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
7191 return token_type::parse_error;
7192 }
7193
7194 case 0x1D:
7195 {
7196 error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
7197 return token_type::parse_error;
7198 }
7199
7200 case 0x1E:
7201 {
7202 error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
7203 return token_type::parse_error;
7204 }
7205
7206 case 0x1F:
7207 {
7208 error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
7209 return token_type::parse_error;
7210 }
7211
7212 // U+0020..U+007F (except U+0022 (quote) and U+005C (backslash))
7213 case 0x20:
7214 case 0x21:
7215 case 0x23:
7216 case 0x24:
7217 case 0x25:
7218 case 0x26:
7219 case 0x27:
7220 case 0x28:
7221 case 0x29:
7222 case 0x2A:
7223 case 0x2B:
7224 case 0x2C:
7225 case 0x2D:
7226 case 0x2E:
7227 case 0x2F:
7228 case 0x30:
7229 case 0x31:
7230 case 0x32:
7231 case 0x33:
7232 case 0x34:
7233 case 0x35:
7234 case 0x36:
7235 case 0x37:
7236 case 0x38:
7237 case 0x39:
7238 case 0x3A:
7239 case 0x3B:
7240 case 0x3C:
7241 case 0x3D:
7242 case 0x3E:
7243 case 0x3F:
7244 case 0x40:
7245 case 0x41:
7246 case 0x42:
7247 case 0x43:
7248 case 0x44:
7249 case 0x45:
7250 case 0x46:
7251 case 0x47:
7252 case 0x48:
7253 case 0x49:
7254 case 0x4A:
7255 case 0x4B:
7256 case 0x4C:
7257 case 0x4D:
7258 case 0x4E:
7259 case 0x4F:
7260 case 0x50:
7261 case 0x51:
7262 case 0x52:
7263 case 0x53:
7264 case 0x54:
7265 case 0x55:
7266 case 0x56:
7267 case 0x57:
7268 case 0x58:
7269 case 0x59:
7270 case 0x5A:
7271 case 0x5B:
7272 case 0x5D:
7273 case 0x5E:
7274 case 0x5F:
7275 case 0x60:
7276 case 0x61:
7277 case 0x62:
7278 case 0x63:
7279 case 0x64:
7280 case 0x65:
7281 case 0x66:
7282 case 0x67:
7283 case 0x68:
7284 case 0x69:
7285 case 0x6A:
7286 case 0x6B:
7287 case 0x6C:
7288 case 0x6D:
7289 case 0x6E:
7290 case 0x6F:
7291 case 0x70:
7292 case 0x71:
7293 case 0x72:
7294 case 0x73:
7295 case 0x74:
7296 case 0x75:
7297 case 0x76:
7298 case 0x77:
7299 case 0x78:
7300 case 0x79:
7301 case 0x7A:
7302 case 0x7B:
7303 case 0x7C:
7304 case 0x7D:
7305 case 0x7E:
7306 case 0x7F:
7307 {
7308 add(current);
7309 break;
7310 }
7311
7312 // U+0080..U+07FF: bytes C2..DF 80..BF
7313 case 0xC2:
7314 case 0xC3:
7315 case 0xC4:
7316 case 0xC5:
7317 case 0xC6:
7318 case 0xC7:
7319 case 0xC8:
7320 case 0xC9:
7321 case 0xCA:
7322 case 0xCB:
7323 case 0xCC:
7324 case 0xCD:
7325 case 0xCE:
7326 case 0xCF:
7327 case 0xD0:
7328 case 0xD1:
7329 case 0xD2:
7330 case 0xD3:
7331 case 0xD4:
7332 case 0xD5:
7333 case 0xD6:
7334 case 0xD7:
7335 case 0xD8:
7336 case 0xD9:
7337 case 0xDA:
7338 case 0xDB:
7339 case 0xDC:
7340 case 0xDD:
7341 case 0xDE:
7342 case 0xDF:
7343 {
7344 if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
7345 {
7346 return token_type::parse_error;
7347 }
7348 break;
7349 }
7350
7351 // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
7352 case 0xE0:
7353 {
7354 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
7355 {
7356 return token_type::parse_error;
7357 }
7358 break;
7359 }
7360
7361 // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
7362 // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
7363 case 0xE1:
7364 case 0xE2:
7365 case 0xE3:
7366 case 0xE4:
7367 case 0xE5:
7368 case 0xE6:
7369 case 0xE7:
7370 case 0xE8:
7371 case 0xE9:
7372 case 0xEA:
7373 case 0xEB:
7374 case 0xEC:
7375 case 0xEE:
7376 case 0xEF:
7377 {
7378 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
7379 {
7380 return token_type::parse_error;
7381 }
7382 break;
7383 }
7384
7385 // U+D000..U+D7FF: bytes ED 80..9F 80..BF
7386 case 0xED:
7387 {
7388 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
7389 {
7390 return token_type::parse_error;
7391 }
7392 break;
7393 }
7394
7395 // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
7396 case 0xF0:
7397 {
7398 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
7399 {
7400 return token_type::parse_error;
7401 }
7402 break;
7403 }
7404
7405 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
7406 case 0xF1:
7407 case 0xF2:
7408 case 0xF3:
7409 {
7410 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
7411 {
7412 return token_type::parse_error;
7413 }
7414 break;
7415 }
7416
7417 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
7418 case 0xF4:
7419 {
7420 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
7421 {
7422 return token_type::parse_error;
7423 }
7424 break;
7425 }
7426
7427 // remaining bytes (80..C1 and F5..FF) are ill-formed
7428 default:
7429 {
7430 error_message = "invalid string: ill-formed UTF-8 byte";
7431 return token_type::parse_error;
7432 }
7433 }
7434 }
7435 }
void reset() noexcept
reset token_buffer; current character is beginning of token
Definition: json.hpp:7924
char_int_type current
the current character
Definition: json.hpp:8219