diff --git a/be/src/core/column/column_string.cpp b/be/src/core/column/column_string.cpp index caa49799d1ec61..af3f99fbe07acc 100644 --- a/be/src/core/column/column_string.cpp +++ b/be/src/core/column/column_string.cpp @@ -36,6 +36,7 @@ #include "util/simd/bits.h" #include "util/simd/vstring_function.h" #include "util/unaligned.h" +#include "util/utf8_check.h" namespace doris { template @@ -759,6 +760,20 @@ bool ColumnStr::is_ascii() const { return simd::VStringFunctions::is_ascii(StringRef(chars.data(), chars.size())); } +template +bool ColumnStr::is_valid_utf8() const { /* Checks every row separately over its own byte range and returns false at the first row that fails validate_utf8; returns true otherwise, including when the column has no rows. */ + const auto num_rows = offsets.size(); + const char* data = reinterpret_cast(chars.data()); + for (size_t i = 0; i < num_rows; ++i) { + auto str_offset = offset_at(i); + auto str_size = size_at(i); + if (!validate_utf8(data + str_offset, str_size)) { /* only the bytes of row i are passed, never the concatenated buffer */ + return false; + } + } + return true; +} + template class ColumnStr; template class ColumnStr; } // namespace doris diff --git a/be/src/core/column/column_string.h b/be/src/core/column/column_string.h index 370d9710d6a5e6..4bf6e3ad95dc21 100644 --- a/be/src/core/column/column_string.h +++ b/be/src/core/column/column_string.h @@ -51,6 +51,9 @@ class Arena; class ColumnSorter; /** Column for String values. + * Note: In string functions, we assume that ColumnStr contains valid UTF-8 encoded data. + * However, ColumnStr is not guaranteed to always hold valid UTF-8, since it is also used + * as a serialization container where the content may be arbitrary binary data. 
*/ template class ColumnStr final : public COWHelper> { @@ -536,6 +539,7 @@ class ColumnStr final : public COWHelper> { } bool is_ascii() const; + bool is_valid_utf8() const; Chars& get_chars() { return chars; } const Chars& get_chars() const { return chars; } diff --git a/be/src/exprs/function/function_string.cpp b/be/src/exprs/function/function_string.cpp index 8ad8d5316315c5..86c3e0f6a69e57 100644 --- a/be/src/exprs/function/function_string.cpp +++ b/be/src/exprs/function/function_string.cpp @@ -44,6 +44,7 @@ #include "exprs/function/string_hex_util.h" #include "util/string_search.hpp" #include "util/url_coding.h" +#include "util/utf8_check.h" namespace doris { struct NameStringASCII { @@ -225,6 +226,29 @@ struct StringUtf8LengthImpl { } }; +struct NameIsValidUTF8 { + static constexpr auto name = "is_valid_utf8"; +}; + +struct IsValidUTF8Impl { /* Row-wise kernel for is_valid_utf8: writes one flag per input row into res (1 when validate_utf8 accepts the row bytes, 0 otherwise). */ + using ReturnType = DataTypeUInt8; + static constexpr auto PrimitiveTypeImpl = PrimitiveType::TYPE_STRING; + using Type = String; + using ReturnColumnType = ColumnUInt8; + + static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, + PaddedPODArray& res) { + auto size = offsets.size(); + res.resize(size); /* one result slot per row */ + for (size_t i = 0; i < size; ++i) { + const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); /* NOTE(review): at i == 0 this indexes offsets[-1]; presumably the offsets array is left-padded so that slot reads as 0 - confirm */ + size_t str_size = offsets[i] - offsets[i - 1]; /* row length = end offset minus previous end offset */ + res[i] = validate_utf8(raw_str, str_size) ? 
1 : 0; + } + return Status::OK(); + } +}; + struct NameStartsWith { static constexpr auto name = "starts_with"; }; @@ -1316,6 +1340,7 @@ using FunctionStringLength = FunctionUnaryToType; using FunctionStringUTF8Length = FunctionUnaryToType; using FunctionStringSpace = FunctionUnaryToType; +using FunctionIsValidUTF8 = FunctionUnaryToType; /* NOTE(review): the template arguments look stripped from this patch text, here and on the register_function call below - confirm against the real file */ using FunctionStringStartsWith = FunctionBinaryToType; using FunctionStringEndsWith = @@ -1422,7 +1447,9 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function>(); factory.register_function>(); factory.register_function(); + factory.register_function(); + factory.register_alias(FunctionIsValidUTF8::name, "isValidUTF8"); /* second name under which the same function is exposed */ factory.register_alias(FunctionToLower::name, "lcase"); factory.register_alias(FunctionToUpper::name, "ucase"); factory.register_alias(FunctionStringUTF8Length::name, "character_length"); diff --git a/be/test/core/column/column_string_test.cpp b/be/test/core/column/column_string_test.cpp index e8a08d952e01e3..d4fa2572427eed 100644 --- a/be/test/core/column/column_string_test.cpp +++ b/be/test/core/column/column_string_test.cpp @@ -1425,4 +1425,89 @@ TEST_F(ColumnStringTest, is_ascii) { } } +TEST_F(ColumnStringTest, is_valid_utf8) { /* Each scoped block builds a fresh ColumnString and asserts the whole-column verdict of is_valid_utf8(). */ + // all ASCII strings are valid UTF-8 + { + auto column = ColumnString::create(); + column->insert_data("hello", 5); + column->insert_data("world", 5); + column->insert_data("123!@#", 6); + EXPECT_TRUE(column->is_valid_utf8()); + } + // empty column is valid + { + auto column = ColumnString::create(); + EXPECT_TRUE(column->is_valid_utf8()); + } + // empty strings are valid UTF-8 + { + auto column = ColumnString::create(); + column->insert_data("", 0); + column->insert_data("", 0); + EXPECT_TRUE(column->is_valid_utf8()); + } + // multi-byte UTF-8 characters + { + auto column = ColumnString::create(); + column->insert_data("Hello, 世界", strlen("Hello, 世界")); + column->insert_data("こんにちは", strlen("こんにちは")); + column->insert_data("😀", strlen("😀")); + 
EXPECT_TRUE(column->is_valid_utf8()); + } /* the cases below insert raw byte arrays with an explicit length; the arrays carry no NUL terminator */ + // invalid: lone continuation byte 0x80 + { + auto column = ColumnString::create(); + const char data[] = {'\x80'}; + column->insert_data(data, 1); + EXPECT_FALSE(column->is_valid_utf8()); + } + // invalid: bad 2-byte sequence 0xC3 0x28 + { + auto column = ColumnString::create(); + const char data[] = {'\xc3', '\x28'}; + column->insert_data(data, 2); + EXPECT_FALSE(column->is_valid_utf8()); + } + // invalid: overlong encoding 0xC0 0xAF + { + auto column = ColumnString::create(); + const char data[] = {'\xc0', '\xaf'}; + column->insert_data(data, 2); + EXPECT_FALSE(column->is_valid_utf8()); + } + // invalid: 0xFE byte + { + auto column = ColumnString::create(); + const char data[] = {'\xfe'}; + column->insert_data(data, 1); + EXPECT_FALSE(column->is_valid_utf8()); + } + // invalid: truncated 3-byte sequence 0xE4 0xB8 + { + auto column = ColumnString::create(); + const char data[] = {'\xe4', '\xb8'}; + column->insert_data(data, 2); + EXPECT_FALSE(column->is_valid_utf8()); + } + // mixed: one invalid byte makes the whole column invalid + { + auto column = ColumnString::create(); + column->insert_data("hello", 5); + const char bad[] = {'\xff'}; + column->insert_data(bad, 1); /* the single bad row sits between two valid rows */ + column->insert_data("world", 5); + EXPECT_FALSE(column->is_valid_utf8()); + } + // cross-row concatenation: "\xE4" + "\xB8\x96" form valid UTF-8 (世) when + // concatenated, but each row is invalid individually. Must validate per-row. 
+ { + auto column = ColumnString::create(); + const char row1[] = {'\xe4'}; + const char row2[] = {'\xb8', '\x96'}; + column->insert_data(row1, 1); + column->insert_data(row2, 2); + EXPECT_FALSE(column->is_valid_utf8()); + } +} + } // namespace doris \ No newline at end of file diff --git a/be/test/exprs/function/function_string_test.cpp b/be/test/exprs/function/function_string_test.cpp index 7ca7b30596ecde..37231fd354cb38 100644 --- a/be/test/exprs/function/function_string_test.cpp +++ b/be/test/exprs/function/function_string_test.cpp @@ -994,6 +994,36 @@ TEST(function_string_test, function_ascii_test) { check_function_all_arg_comb(func_name, input_types, data_set); } +TEST(function_string_test, function_is_valid_utf8_test) { /* Each data_set row pairs one VARCHAR input with the expected result: 1 for valid UTF-8, 0 for invalid, Null() for a NULL input. */ + std::string func_name = "is_valid_utf8"; + + InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR}; + + DataSet data_set = { + // valid UTF-8 strings + {{std::string("hello")}, std::uint8_t(1)}, + {{std::string("")}, std::uint8_t(1)}, + {{std::string("Hello, 世界")}, std::uint8_t(1)}, + {{std::string("こんにちは")}, std::uint8_t(1)}, + {{std::string("123!@#")}, std::uint8_t(1)}, + {{std::string("\xc3\xb1")}, std::uint8_t(1)}, // ñ + {{std::string("\xe2\x82\xac")}, std::uint8_t(1)}, // € + {{std::string("\xf0\x9f\x98\x80")}, std::uint8_t(1)}, // 😀 + // invalid UTF-8 strings + {{std::string("\x80")}, std::uint8_t(0)}, // invalid leading byte + {{std::string("\xc3\x28")}, std::uint8_t(0)}, // invalid 2-byte sequence + {{std::string("\xe2\x28\xa1")}, std::uint8_t(0)}, // invalid 3-byte sequence + {{std::string("\xf0\x28\x8c\xbc")}, std::uint8_t(0)}, // invalid 4-byte sequence + {{std::string("\xfe")}, std::uint8_t(0)}, // invalid byte 0xFE + {{std::string("\xff")}, std::uint8_t(0)}, // invalid byte 0xFF + {{std::string("abc\xc0\xaf")}, std::uint8_t(0)}, // overlong encoding + // NULL + {{Null()}, Null()}, + }; + + check_function_all_arg_comb(func_name, input_types, data_set); +} + TEST(function_string_test, function_char_length_test) { std::string 
func_name = "char_length"; diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index f7b21c7dfbf095..bc95fdcb490f41 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -278,6 +278,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String; import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan; import org.apache.doris.nereids.trees.expressions.functions.scalar.IsUuid; +import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8; import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray; import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull; import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains; @@ -837,6 +838,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(IsIpAddressInRange.class, "is_ip_address_in_range"), scalar(IsNan.class, "isnan"), scalar(IsUuid.class, "is_uuid"), + scalar(IsValidUtf8.class, "is_valid_utf8", "isValidUTF8"), /* registered under the snake_case name and the isValidUTF8 alias, the same two names the backend patch registers */ scalar(IsInf.class, "isinf"), scalar(Ipv4CIDRToRange.class, "ipv4_cidr_to_range"), scalar(Ipv6CIDRToRange.class, "ipv6_cidr_to_range"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java new file mode 100644 index 00000000000000..0c045182785337 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.BooleanType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'is_valid_utf8'. + */ +public class IsValidUtf8 extends ScalarFunction + implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable { /* NOTE(review): PropagateNullable presumably makes a NULL argument produce NULL, which is what the regression expectations in this patch show - confirm */ + + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(BooleanType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT), + FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE) + ); /* both signatures return BOOLEAN; the single argument is VARCHAR or STRING */ + + /** + * constructor with 1 argument. 
+ */ + public IsValidUtf8(Expression arg) { + super("is_valid_utf8", arg); + } + + /** constructor for withChildren and reuse signature */ + private IsValidUtf8(ScalarFunctionParams functionParams) { + super(functionParams); + } + + /** + * withChildren. + */ + @Override + public IsValidUtf8 withChildren(List children) { + Preconditions.checkArgument(children.size() == 1); /* unary function: exactly one child is accepted */ + return new IsValidUtf8(getFunctionParams(children)); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitIsValidUtf8(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index a20abfeae853c7..f02579c4326c1a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -298,6 +298,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String; import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan; import org.apache.doris.nereids.trees.expressions.functions.scalar.IsUuid; +import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8; import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray; import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull; import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains; @@ -1701,6 +1702,10 @@ default R visitIsUuid(IsUuid isUuid, C context) { return visitScalarFunction(isUuid, context); } + default R visitIsValidUtf8(IsValidUtf8 isValidUtf8, C context) { + return visitScalarFunction(isValidUtf8, context); /* default behaviour: delegate to the generic scalar-function visit */ + } + default R visitIsInf(IsInf isInf, C context) { return 
visitScalarFunction(isInf, context); } diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out b/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out new file mode 100644 index 00000000000000..20ad7bc05ca55b --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out @@ -0,0 +1,60 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !valid_1 -- +true + +-- !valid_2 -- +true + +-- !valid_3 -- +true + +-- !valid_4 -- +true + +-- !valid_5 -- +true + +-- !null_1 -- +\N + +-- !invalid_1 -- +false + +-- !invalid_2 -- +false + +-- !invalid_3 -- +false + +-- !invalid_4 -- +false + +-- !invalid_5 -- +false + +-- !invalid_6 -- +false + +-- !invalid_7 -- +false + +-- !invalid_8 -- +false + +-- !alias_1 -- +true + +-- !alias_2 -- +true + +-- !alias_3 -- +false + +-- !table_1 -- +1 true +2 true +3 true +4 \N +5 false +6 false + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy new file mode 100644 index 00000000000000..2883ab41976c1b --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_is_valid_utf8") { + // basic valid UTF-8 strings + qt_valid_1 "SELECT is_valid_utf8('hello');" + qt_valid_2 "SELECT is_valid_utf8('');" + qt_valid_3 "SELECT is_valid_utf8('Hello, 世界');" + qt_valid_4 "SELECT is_valid_utf8('こんにちは');" + qt_valid_5 "SELECT is_valid_utf8('123!@#');" + + // NULL handling + qt_null_1 "SELECT is_valid_utf8(NULL);" + + // invalid UTF-8 strings constructed via unhex + // 0x80: lone continuation byte + qt_invalid_1 "SELECT is_valid_utf8(unhex('80'));" + // 0xC3 0x28: invalid 2-byte sequence (second byte not continuation) + qt_invalid_2 "SELECT is_valid_utf8(unhex('C328'));" + // 0xE2 0x28 0xA1: invalid 3-byte sequence (second byte not continuation) + qt_invalid_3 "SELECT is_valid_utf8(unhex('E228A1'));" + // 0xF0 0x28 0x8C 0xBC: invalid 4-byte sequence (second byte not continuation) + qt_invalid_4 "SELECT is_valid_utf8(unhex('F0288CBC'));" + // 0xFE: not valid in UTF-8 + qt_invalid_5 "SELECT is_valid_utf8(unhex('FE'));" + // 0xFF: not valid in UTF-8 + qt_invalid_6 "SELECT is_valid_utf8(unhex('FF'));" + // overlong encoding of '/' (U+002F): 0xC0 0xAF + qt_invalid_7 "SELECT is_valid_utf8(unhex('C0AF'));" + // truncated 3-byte sequence: 0xE4 0xB8 + qt_invalid_8 "SELECT is_valid_utf8(unhex('E4B8'));" + + // alias isValidUTF8 + qt_alias_1 "SELECT isValidUTF8('hello');" + qt_alias_2 "SELECT isValidUTF8('');" + // alias with invalid bytes + qt_alias_3 "SELECT isValidUTF8(unhex('80'));" /* the three alias cases reuse the inputs of valid_1, valid_2 and invalid_1 */ + + // test with table data (including invalid UTF-8 via unhex) + sql "DROP TABLE IF EXISTS test_is_valid_utf8_tbl" + sql """ + CREATE 
TABLE test_is_valid_utf8_tbl ( + id INT, + val VARCHAR(200) + ) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_is_valid_utf8_tbl VALUES + (1, 'hello'), + (2, ''), + (3, 'Hello, 世界'), + (4, NULL); + """ + sql "INSERT INTO test_is_valid_utf8_tbl VALUES (5, unhex('C0AF'));" + sql "INSERT INTO test_is_valid_utf8_tbl VALUES (6, unhex('FF'));" + + order_qt_table_1 "SELECT id, is_valid_utf8(val) FROM test_is_valid_utf8_tbl ORDER BY id;" /* rows 1-3 hold valid text, row 4 is NULL, rows 5-6 hold invalid bytes inserted via unhex */ + + // test fold const + testFoldConst("SELECT is_valid_utf8('hello');") + testFoldConst("SELECT is_valid_utf8('');") + testFoldConst("SELECT is_valid_utf8(NULL);") + testFoldConst("SELECT isValidUTF8('hello');") /* NOTE(review): the fold-const checks cover only valid and NULL inputs; no invalid-byte input is folded here */ +}