diff --git a/lib/input/src/parse/lexer_impl.xch b/lib/input/src/parse/lexer_impl.xch index bf65c2e67..bcefde22b 100644 --- a/lib/input/src/parse/lexer_impl.xch +++ b/lib/input/src/parse/lexer_impl.xch @@ -27,7 +27,8 @@ namespace CppClingo::Input::Parse { WS = [\t\r\n ]*; IDENTIFIER = [_']*[a-z]['A-Za-z0-9_]*; VARIABLE = [_']*[A-Z]['A-Za-z0-9_]*; - STRING = "\"" ([^\\"\n\000]|"\\\""|"\\\\"|"\\n")* "\""; + UNICODE = "\\u{" [0-9A-Fa-f]+ "}"; + STRING = "\"" ([^\\"\n\000]|"\\\""|"\\\\"|"\\n"|"\\t"|"\\r"|UNICODE)* "\""; INCSTRING = "<" IDENTIFIER ">"; THEORYOP = [/!<=>+\-*\\?&@|:;~^.]+; KEYWORD = "#" [a-zA-Z0-9_]*; @@ -49,7 +50,7 @@ namespace CppClingo::Input::Parse { SIGN = [-+ ]; GROUPING = [,_]; TYPE = [bcdoxXns]; - FLIT = ([^\\"\n{}\000] | "\\\"" | "\\\\" | "\\n" | "{{" | "}}")*; + FLIT = ([^\\"\n{}\000] | "\\\"" | "\\\\" | "\\n" | "\\t" | "\\r" | UNICODE | "{{" | "}}")*; FSPEC = ACCESSOR* CONVERSION? ([:] (DOT? ALIGN)? SIGN? [#]? POS? GROUPING? TYPE?)?; */ diff --git a/lib/input/src/parse/term.cc b/lib/input/src/parse/term.cc index d4d78935d..b3206773f 100644 --- a/lib/input/src/parse/term.cc +++ b/lib/input/src/parse/term.cc @@ -205,7 +205,9 @@ auto cont_fstr(ParserState &state) -> bool { if (auto view = state.view().substr(0, state.view().size() - 1); !view.empty()) { auto loc = Location{state.cursor_pos(), Position(state.file(), state.token_line(), state.token_column() - 1)}; - str.fields.emplace_back(std::in_place_type, loc, state.store().string(view)); + auto &buf = state.buf(view.size()); + Util::unquote(view, std::back_inserter(buf), true); + str.fields.emplace_back(std::in_place_type, loc, state.store().string(buf)); } state.consume(); state.push(Prod::fstr_field); @@ -214,7 +216,9 @@ auto cont_fstr(ParserState &state) -> bool { if (auto view = state.view().substr(0, state.view().size() - 1); !view.empty()) { auto loc = Location{state.cursor_pos(), Position(state.file(), state.token_line(), state.token_column() - 1)}; - str.fields.emplace_back(std::in_place_type, loc, state.store().string(view)); + auto &buf = state.buf(view.size()); + Util::unquote(view, std::back_inserter(buf), true); + str.fields.emplace_back(std::in_place_type, loc, state.store().string(buf)); } auto fields = std::move(str.fields); auto start = Position{state.file(), str.line, str.column}; diff --git a/lib/input/tests/parser.cc b/lib/input/tests/parser.cc index f2c5688c3..a7c40b927 100644 --- a/lib/input/tests/parser.cc +++ b/lib/input/tests/parser.cc @@ -86,6 +86,21 @@ TEST_CASE("parsev2") { REQUIRE(parse("(a,;a)") == "(a,;a)"); REQUIRE(parse("f(;)") == "f(;)"); REQUIRE(parse("f(\"x\")") == "f(\"x\")"); + REQUIRE(parse("f(\"\\n\")") == "f(\"\\n\")"); + REQUIRE(parse("f(\"\\t\")") == "f(\"\\t\")"); + REQUIRE(parse("f(\"\\r\")") == "f(\"\\r\")"); + REQUIRE(parse("f(\"\\u{041}\")") == "f(\"A\")"); + REQUIRE(parse("f(\"\\u{2665}\")") == "f(\"♥\")"); + REQUIRE(parse("f(\"\\u{1F602}\")") == "f(\"😂\")"); + REQUIRE(parse("f(\"\\u{FF}\")") == "f(\"\xC3\xBF\")"); + REQUIRE(parse("f(\"\\u{1F600}\")") == "f(\"\xF0\x9F\x98\x80\")"); + REQUIRE(parse("f(\"\\u{10FFFF}\")") == "f(\"\xF4\x8F\xBF\xBF\")"); + REQUIRE(parse("f(\"hello\\u{00E9}\\nworld\")") == "f(\"helloé\\nworld\")"); + REQUIRE(parse("f\"hello\\nworld\"") == "f\"hello\\nworld\""); + REQUIRE(parse("f\"hello\\tworld\"") == "f\"hello\\tworld\""); + REQUIRE(parse("f\"hello\\rworld\"") == "f\"hello\\rworld\""); + REQUIRE(parse("f\"{{hello}}\"") == "f\"{{hello}}\""); + REQUIRE(parse("f\"hello\\u{00E9}\\nworld\"") == "f\"helloé\\nworld\""); REQUIRE(parse("a+b+c") == "a+b+c"); REQUIRE(parse("a*b+c") == "a*b+c"); REQUIRE(parse("a+b*c") == "a+b*c"); diff --git a/lib/python-api/tests/test_write_aspif.py b/lib/python-api/tests/test_write_aspif.py index bb4b6a01d..092bb2877 100644 --- a/lib/python-api/tests/test_write_aspif.py +++ b/lib/python-api/tests/test_write_aspif.py @@ -121,9 +121,7 @@ def test_buffer_inc(self): ctl.parse_string("""{c}. #show d : a, c.""") ctl.ground() ctl.solve() - assert ( - ctl.buffer - == """\ + assert ctl.buffer == """\ asp 2 0 0 incremental 1 1 1 1 0 0 4 1 0 1 b @@ -135,7 +133,6 @@ def test_buffer_inc(self): 4 0 2 1 c 0 """ - ) def test_rule(self): """ diff --git a/lib/util/include/clingo/util/print.hh b/lib/util/include/clingo/util/print.hh index 4e902434b..91c6ebd5c 100644 --- a/lib/util/include/clingo/util/print.hh +++ b/lib/util/include/clingo/util/print.hh @@ -313,6 +313,8 @@ class PrintQuoted { out << "\\n"; } else if (c == '\t') { out << "\\t"; + } else if (c == '\r') { + out << "\\r"; } else if (c == '"') { out << "\\\""; } else { diff --git a/lib/util/include/clingo/util/string.hh b/lib/util/include/clingo/util/string.hh index 7967edfe3..cec0669d6 100644 --- a/lib/util/include/clingo/util/string.hh +++ b/lib/util/include/clingo/util/string.hh @@ -1,52 +1,94 @@ #pragma once -#include +#include +#include #include namespace CppClingo::Util { -void quote(std::string_view in, auto out) { - for (auto c : in) { - switch (c) { - case '\n': { - *out++ = '\\'; - *out++ = 'n'; - break; - } - case '\t': { - *out++ = '\\'; - *out++ = 't'; - break; - } - case '\\': { - *out++ = '\\'; - *out++ = '\\'; - break; - } - case '"': { - *out++ = '\\'; - *out++ = '"'; - break; - } - default: { - *out++ = c; - break; +namespace Detail { + +// NOLINTBEGIN(readability-magic-numbers) + +inline auto hex_val(char c) -> uint32_t { + if (c >= '0' && c <= '9') { + return c - '0'; + } + if (c >= 'A' && c <= 'F') { + return 10 + (c - 'A'); + } + if (c >= 'a' && c <= 'f') { + return 10 + (c - 'a'); + } + throw std::invalid_argument("invalid hex character"); +} + +inline auto encode_utf8(uint32_t cp, auto out) -> void { + if (cp > 0x10FFFF) { + throw std::invalid_argument("invalid unicode code point"); + } + if (cp <= 0x7F) { + *out++ = static_cast(cp); + } else if (cp <= 0x7FF) { + *out++ = static_cast(0xC0 | (cp >> 6)); + *out++ = static_cast(0x80 | (cp & 0x3F)); + } else if (cp <= 0xFFFF) { + *out++ = static_cast(0xE0 | (cp >> 12)); + *out++ = static_cast(0x80 | ((cp >> 6) & 0x3F)); + *out++ = static_cast(0x80 | (cp & 0x3F)); + } else { + *out++ = static_cast(0xF0 | (cp >> 18)); + *out++ = static_cast(0x80 | ((cp >> 12) & 0x3F)); + *out++ = static_cast(0x80 | ((cp >> 6) & 0x3F)); + *out++ = static_cast(0x80 | (cp & 0x3F)); + } +} + +inline auto parse_unicode_escape(auto it, auto ie, auto out) -> auto { + uint32_t cp = 0; + if (it == ie || *it != '{') { + throw std::runtime_error("expected '{' at the beginning of unicode escape"); + } + size_t count = 0; + for (++it; it != ie; ++it, ++count) { + if (*it == '}') { + if (count == 0) { + throw std::runtime_error("expected at least one hex digit in unicode escape"); } + encode_utf8(cp, out); + return it; } + if (count >= 6) { + throw std::runtime_error("too many hex digits in unicode escape"); + } + cp = (cp << 4) | hex_val(*it); } + throw std::runtime_error("expected '}' at the end of unicode escape"); } + +// NOLINTEND(readability-magic-numbers) + +} // namespace Detail + void unquote(std::string_view in, auto out, bool fstring = false) { auto escape = '\0'; - for (auto c : in) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + for (auto it = in.begin(), ie = in.end(); it != ie; ++it) { + char c = *it; if (escape == '{' || escape == '}') { if (c == escape) { *out++ = escape; } else { - assert(false); + throw std::runtime_error("expected brace"); } escape = '\0'; } else if (escape == '\\') { switch (c) { + case 'u': { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + it = Detail::parse_unicode_escape(it + 1, ie, out); + break; + } case 'n': { *out++ = '\n'; break; @@ -55,6 +97,10 @@ void unquote(std::string_view in, auto out, bool fstring = false) { *out++ = '\t'; break; } + case 'r': { + *out++ = '\r'; + break; + } case '\\': { *out++ = '\\'; break; @@ -64,8 +110,7 @@ void unquote(std::string_view in, auto out, bool fstring = false) { break; } default: { - assert(false); - break; + throw std::runtime_error("invalid escape sequence"); } } escape = '\0';